| /////////////////////////////////////////////////////////////////////////////// |
| // |
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| // |
| /////////////////////////////////////////////////////////////////////////////// |
| |
| #include <iostream> |
| |
| using namespace std; |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <setjmp.h> |
| #include <signal.h> |
| #include <fcntl.h> |
| #include <netdb.h> |
| #include <sys/socket.h> |
| #include <netinet/in.h> |
| #include <netinet/tcp.h> |
| #include <sys/epoll.h> |
| #include <sys/file.h> |
| #include <sys/time.h> |
| #include <sys/resource.h> |
| #include <errno.h> |
| #include <limits.h> |
| #include <unistd.h> |
| |
| #include "localio.h" |
| #include "mlio.h" |
| #include "monlogging.h" |
| #include "monsonar.h" |
| #include "montrace.h" |
| #include "redirector.h" |
| #include "healthcheck.h" |
| #include "config.h" |
| #include "device.h" |
| #include "cluster.h" |
| #include "monitor.h" |
| |
| #include "replicate.h" |
| |
| #include "clusterconf.h" |
| #include "lnode.h" |
| #include "pnode.h" |
| #include "reqqueue.h" |
| #include "zclient.h" |
| #include "commaccept.h" |
| #include "meas.h" |
| #ifdef NAMESERVER_PROCESS |
| #include "nscommacceptmon.h" |
| #else |
| #include "nameserver.h" |
| #endif |
| |
| extern bool IAmIntegrating; |
| extern bool IAmIntegrated; |
| extern bool IsRealCluster; |
| extern bool IsAgentMode; |
| extern bool IsMaster; |
| extern bool IsMPIChild; |
| extern char MasterMonitorName[MAX_PROCESS_PATH]; |
| extern char Node_name[MPI_MAX_PROCESSOR_NAME]; |
| extern bool ZClientEnabled; |
| extern char IntegratingMonitorPort[MPI_MAX_PORT_NAME]; |
| extern char MyCommPort[MPI_MAX_PORT_NAME]; |
| extern char MyMPICommPort[MPI_MAX_PORT_NAME]; |
| extern char MySyncPort[MPI_MAX_PORT_NAME]; |
| #ifdef NAMESERVER_PROCESS |
| extern CCommAcceptMon CommAcceptMon; |
| extern char MyMon2NsPort[MPI_MAX_PORT_NAME]; |
| #else |
| extern CNameServer *NameServer; |
| extern bool NameServerEnabled; |
| extern char MyPtPPort[MPI_MAX_PORT_NAME]; |
| #endif |
| extern bool SMSIntegrating; |
| extern int CreatorShellPid; |
| extern Verifier_t CreatorShellVerifier; |
| extern CommType_t CommType; |
| |
| extern int MyPNID; |
| |
| extern CReqQueue ReqQueue; |
| |
| extern CMonitor *Monitor; |
| extern CNodeContainer *Nodes; |
| extern CConfigContainer *Config; |
| #ifndef NAMESERVER_PROCESS |
| extern CDeviceContainer *Devices; |
| #endif |
| extern CNode *MyNode; |
| extern CMonStats *MonStats; |
| #ifndef NAMESERVER_PROCESS |
| extern CRedirector Redirector; |
| #endif |
| extern CMonLog *MonLog; |
| extern CHealthCheck HealthCheck; |
| extern CCommAccept CommAccept; |
| extern CZClient *ZClient; |
| extern CMeas Meas; |
| |
| extern long next_test_delay; |
| extern CReplicate Replicator; |
| |
| extern char *ErrorMsg (int error_code); |
| |
| extern const char *ProcessTypeString( PROCESSTYPE type ); |
| |
| const char *JoiningPhaseString( JOINING_PHASE phase); |
| const char *StateString( STATE state); |
| #ifndef NAMESERVER_PROCESS |
| const char *SyncStateString( SyncState state); |
| #endif |
| const char *EpollEventString( __uint32_t events ); |
| const char *EpollOpString( int op ); |
| const char *NodePhaseString( NodePhase phase ); |
| #ifdef NAMESERVER_PROCESS |
| #define MPI_Abort(a,b) abort() |
| #endif |
| |
| const char *NodePhaseString( NodePhase phase ) |
| { |
| const char *str; |
| |
| switch( phase ) |
| { |
| case Phase_Ready: |
| str = "Phase_Ready"; |
| break; |
| case Phase_Activating: |
| str = "Phase_Activating"; |
| break; |
| case Phase_SoftDown: |
| str = "Phase_SoftDown"; |
| break; |
| case Phase_SoftUp: |
| str = "Phase_SoftUp"; |
| break; |
| default: |
| str = "NodePhase - Undefined"; |
| break; |
| } |
| |
| return( str ); |
| } |
| |
| void CCluster::ActivateSpare( CNode *spareNode, CNode *downNode, bool checkHealth ) |
| { |
| const char method_name[] = "CCluster::ActivateSpare"; |
| TRACE_ENTRY; |
| // if not checking health, assume the spare is healthy |
| bool spareHealthy = checkHealth ? false : true; |
| int tmCount = 0; |
| CNode *node; |
| CLNode *lnode; |
| |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf( "%s@%d - pnid=%d, name=%s (%s) is taking over pnid=%d, name=%s (%s), check health=%d, isIntegrating=%d , integrating pnid=%d\n" |
| , method_name, __LINE__ |
| , spareNode->GetPNid(), spareNode->GetName(), StateString(spareNode->GetState()) |
| , downNode->GetPNid(), downNode->GetName(), StateString(downNode->GetState()) |
| , checkHealth, IsIntegrating(), integratingPNid_ ); |
| } |
| |
| if ( checkHealth ) |
| { |
| // TODO: Execute physical node health check script here |
| spareHealthy = true; |
| if ( !spareHealthy ) |
| { |
| // and tell the cluster the node is down, since the spare can't takeover |
| CReplNodeDown *repl = new CReplNodeDown(downNode->GetPNid()); |
| Replicator.addItem(repl); |
| } |
| } |
| |
| if ( spareHealthy ) |
| { |
| if ( downNode->GetPNid() != spareNode->GetPNid() ) |
| { |
| // Move down node's logical nodes to spare node |
| downNode->MoveLNodes( spareNode ); |
| |
| spareNode->SetPhase( Phase_Activating ); |
| |
| Nodes->AddToSpareNodesList( downNode->GetPNid() ); |
| |
| if ( !IsIntegrating() ) |
| { |
| downNode->SetState( State_Down ); |
| |
| // Send process death notices |
| #ifndef NAMESERVER_PROCESS |
| spareNode->KillAllDown(); |
| #endif |
| |
| // Send node down notice |
| lnode = spareNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| // Watchdog process clone was removed in KillAllDown |
| lnode->Down(); |
| } |
| } |
| } |
| |
| // Any DTMs running? |
| for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ ) |
| { |
| node = Nodes->GetNodeByMap( i ); |
| lnode = node->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| CProcess *process = lnode->GetProcessLByType( ProcessType_DTM ); |
| if ( process ) tmCount++; |
| } |
| } |
| |
| // Create Watchdog and PSD processes if this node is the activating spare |
| if ( spareNode->GetPNid() == MyPNID ) |
| { |
| #ifndef NAMESERVER_PROCESS |
| Monitor->StartPrimitiveProcesses(); |
| #endif |
| } |
| else |
| { |
| // Check for end of joining phase on node re-integration |
| if ( spareNode->GetState() == State_Joining ) |
| { |
| spareNode->SetState( State_Up ); |
| } |
| #ifndef NAMESERVER_PROCESS |
| if ( tmCount ) |
| { |
| // Send node prepare notice to local DTM processes |
| lnode = spareNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| lnode->PrepareForTransactions( downNode->GetPNid() != spareNode->GetPNid() ); |
| } |
| } |
| #else |
| ResetIntegratingPNid(); |
| #endif |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| if ( downNode->GetPNid() != spareNode->GetPNid() ) |
| { |
| // we need to abort any active TmSync |
| if (( MyNode->GetTmSyncState() == SyncState_Start ) || |
| ( MyNode->GetTmSyncState() == SyncState_Continue ) || |
| ( MyNode->GetTmSyncState() == SyncState_Commit ) ) |
| { |
| MyNode->SetTmSyncState( SyncState_Abort ); |
| Monitor->SetAbortPendingTmSync(); |
| if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d" " - Node " "%d" " TmSyncState updated (" "%d" ")" "\n", method_name, __LINE__, MyPNID, MyNode->GetTmSyncState()); |
| } |
| } |
| #endif |
| |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf( "%s@%d - Spare node activating! pnid=%d, name=(%s)\n" |
| , method_name, __LINE__ |
| , spareNode->GetPNid(), spareNode->GetName()); |
| } |
| } |
| |
| if ( spareNode->GetPNid() == MyPNID && spareHealthy ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - Replicating activate spare node pnid=%d, name=%s (%s), spare=%d, down pnid=%d, name=%s (%s), DTM count=%d\n" |
| , method_name, __LINE__ |
| , spareNode->GetPNid(), spareNode->GetName(), StateString(spareNode->GetState()) |
| , spareNode->IsSpareNode() |
| , downNode->GetPNid(), downNode->GetName(), StateString(downNode->GetState()) |
| , tmCount ); |
| // Let other monitors know is ok to activate this spare node |
| CReplActivateSpare *repl = new CReplActivateSpare( MyPNID, downNode->GetPNid() ); |
| Replicator.addItem(repl); |
| |
| #ifndef NAMESERVER_PROCESS |
| if ( !tmCount ) |
| { |
| // No DTMs in environment so implicitly make ready for transactions |
| lnode = MyNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| ReqQueue.enqueueTmReadyReq( lnode->GetNid() ); |
| } |
| } |
| #endif |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| void CCluster::NodeTmReady( int nid ) |
| { |
| const char method_name[] = "CCluster::NodeTmReady"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf( "%s@%d - nid=%d\n", method_name, __LINE__, nid ); |
| } |
| |
| tmReadyCount_++; |
| |
| if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - TmReady, nid=%d, tm count=%d, soft node down=%d, LNodesCount=%d\n" |
| , method_name, __LINE__ |
| , nid |
| , tmReadyCount_ |
| , MyNode->IsSoftNodeDown() |
| , MyNode->GetLNodesCount() ); |
| } |
| |
| MyNode->StartPStartDPersistentDTM( nid ); |
| |
| if ( MyNode->GetLNodesCount() == tmReadyCount_ ) |
| { |
| if ( MyNode->IsSoftNodeDown() ) |
| { |
| MyNode->ResetSoftNodeDown(); |
| |
| MyNode->SetPhase( Phase_Ready ); |
| |
| char la_buf[MON_STRING_BUF_SIZE]; |
| sprintf( la_buf, "[%s], Soft Node up! pnid=%d, name=(%s)\n" |
| , method_name, MyNode->GetPNid(), MyNode->GetName()); |
| mon_log_write(MON_CLUSTER_NODE_TM_READY_1, SQ_LOG_INFO, la_buf); |
| } |
| else |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| sprintf(la_buf, "[%s], Node activated! pnid=%d, name=(%s) \n", method_name, MyNode->GetPNid(), MyNode->GetName()); |
| mon_log_write(MON_CLUSTER_NODE_TM_READY_2, SQ_LOG_INFO, la_buf); |
| |
| // Let other monitors know the node is up |
| CReplActivateSpare *repl = new CReplActivateSpare( MyPNID, -1 ); |
| Replicator.addItem(repl); |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| void CCluster::NodeReady( CNode *spareNode ) |
| { |
| const char method_name[] = "CCluster::NodeReady"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf( "%s@%d - spare node %s pnid=%d\n" |
| , method_name, __LINE__, spareNode->GetName(), spareNode->GetPNid() ); |
| } |
| |
| assert( spareNode->GetState() == State_Up ); |
| |
| // Send node up notice |
| CLNode *lnode = spareNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| lnode->Up(); |
| } |
| |
| spareNode->SetActivatingSpare( false ); |
| ResetIntegratingPNid(); |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::UpdateMonitorPort (const char* newMaster) |
| { |
| const char method_name[] = "CCluster::UpdateMonitorPort"; |
| TRACE_ENTRY; |
| |
| char *monitorPort = getenv ("MONITOR_COMM_PORT"); |
| if ((monitorPort) && (newMaster)) |
| { |
| strcpy( IntegratingMonitorPort, newMaster ); |
| strcat( IntegratingMonitorPort, ":"); |
| strcat( IntegratingMonitorPort, monitorPort); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " (MasterMonitor) UpdateMonitorPort Updating IntegratingMonitorPort to %s\n", |
| method_name, __LINE__,IntegratingMonitorPort ); |
| } |
| } |
| TRACE_EXIT; |
| } |
| |
| // Assign leaders as required |
| // Current leaders are TM Leader and Monitor Leader |
| void CCluster::AssignLeaders( int pnid, const char* failedMaster, bool checkProcess ) |
| { |
| const char method_name[] = "CCluster::AssignLeaders"; |
| TRACE_ENTRY; |
| |
| #ifndef NAMESERVER_PROCESS |
| AssignTmLeader ( pnid, checkProcess ); |
| #else |
| pnid = pnid; |
| checkProcess = checkProcess; |
| #endif |
| AssignMonitorLeader ( failedMaster ); |
| |
| TRACE_EXIT; |
| } |
| |
| // Assign monitor lead in the case of failure |
| void CCluster::AssignMonitorLeader( const char* failedMaster ) |
| { |
| const char method_name[] = "CCluster::AssignMonitorLeader"; |
| TRACE_ENTRY; |
| |
| int i = 0; |
| int rc = 0; |
| |
| int monitorLeaderPNid = -1; |
| CNode *node = NULL; |
| |
| if (failedMaster == NULL) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d" " - (MasterMonitor) failedMaster is NULL, returning\n" , method_name, __LINE__); |
| } |
| TRACE_EXIT; |
| return; |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d" " - (MasterMonitor) " " MonitorLeader (%s) failed!\n" |
| , method_name, __LINE__, failedMaster ); |
| } |
| |
| if (!IsAgentMode || !ZClientEnabled) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d" " - (MasterMonitor) not AgentMode or zookeeper not enabled, returning\n" |
| , method_name, __LINE__); |
| } |
| TRACE_EXIT; |
| return; |
| } |
| // delete old master if needed |
| const char *masterMonitor = ZClient->WaitForAndReturnMaster (false); |
| if (masterMonitor) |
| { |
| // IFF it is the failed master, delete, do not delete anything else because we could delete a new master |
| if (strcmp (masterMonitor, failedMaster) == 0) |
| { |
| ZClient->WatchNodeMasterDelete (failedMaster); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d" " - (MasterMonitor) deleting master %s\n" |
| , method_name, __LINE__, masterMonitor ); |
| } |
| } |
| // no worries |
| else |
| { |
| rc = ZClient->WatchMasterNode( masterMonitor ); |
| UpdateMonitorPort ( masterMonitor ); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d" " - (MasterMonitor) master did not match, set watch (rc = %d) and returning %s\n" |
| , method_name, __LINE__, rc, masterMonitor ); |
| } |
| TRACE_EXIT; |
| return; |
| } |
| } |
| |
| // choose a new master |
| if (((MyNode) && ((MyNode->GetState() != State_Up) ||(!IAmIntegrated))) || (MyNode == NULL /* not set up yet*/)) |
| { |
| // Do not let this monitor participate in choosing the master. It can wait until an integrated |
| // monitor makes a decision. |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d" " - (MasterMonitor) This Node is not set up yet and will not participate in master choice!\n" |
| , method_name, __LINE__); |
| } |
| |
| // wait until another monitor choose a master |
| const char *masterMonitor = ZClient->WaitForAndReturnMaster (true); |
| if (masterMonitor) |
| { |
| rc = ZClient->WatchMasterNode( masterMonitor ); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc); |
| } |
| |
| UpdateMonitorPort ( masterMonitor ); |
| } |
| TRACE_EXIT; |
| return; |
| } |
| |
| // For all monitors who are up - choose the master using the same logic |
| for (i=0; i<GetConfigPNodesMax(); i++) |
| { |
| monitorLeaderPNid++; // set to -1, so this will bump it to 0 on the first time through |
| |
| if (monitorLeaderPNid == GetConfigPNodesMax()) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader Unable to create or set watch\n", method_name, __LINE__); |
| } |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Unable to create or set watch on master, hit max\n" |
| , method_name ); |
| mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_1, SQ_LOG_ERR, buf); |
| break; |
| } |
| |
| if (Node[monitorLeaderPNid] == NULL) |
| { |
| continue; |
| } |
| |
| node = Node[monitorLeaderPNid]; |
| |
| // skip this node |
| if ( node == NULL ) |
| { |
| continue; |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n" |
| , method_name, __LINE__ |
| , node->GetPNid() |
| , node->GetName() |
| , NodePhaseString(node->GetPhase()) |
| , node->IsSoftNodeDown()); |
| } |
| |
| if ( node->IsSpareNode() || |
| node->IsSoftNodeDown() || |
| node->GetState() != State_Up || |
| node->GetPhase() != Phase_Ready ) |
| { |
| continue; // skip this node for any of the above reasons |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " - Node " "%d" " is the new monitorLeaderPNid." "\n", method_name, __LINE__, node->GetPNid()); |
| } |
| |
| const char *masterMonitor = ZClient->WaitForAndReturnMaster (false); |
| |
| //nobody has written it yet, we don't want to overwrite anything |
| if (!masterMonitor) |
| { |
| rc = ZClient->CreateMasterZNode ( node->GetName() ); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader CreateMasterZNode with rc = %d\n", method_name, __LINE__, rc); |
| } |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Master Monitor is %s on node %d\n" |
| , method_name, node->GetName(), node->GetPNid() ); |
| mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_2, SQ_LOG_INFO, buf); |
| |
| if ( (rc == ZOK) || (rc == ZNODEEXISTS) ) |
| { |
| rc = ZClient->WatchMasterNode( node->GetName() ); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc); |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader Unable to create or set watch\n", method_name, __LINE__); |
| } |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Unable to create or set watch on master node %s\n" |
| , method_name, node->GetName() ); |
| mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_3, SQ_LOG_ERR, buf); |
| } |
| } |
| else |
| { |
| rc = ZClient->WatchMasterNode( masterMonitor ); |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Master Monitor is %s\n" |
| , method_name, masterMonitor); |
| mon_log_write(MON_CLUSTER_ASSIGNMONITORLEADER_4, SQ_LOG_INFO, buf); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc); |
| } |
| } |
| |
| break; |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| // Assigns a new TMLeader if given pnid is same as tmLeaderNid_ |
| // TmLeader is a logical node num. |
| // pnid has gone down, so if that node was previously the TM leader, a new one needs to be chosen. |
| void CCluster::AssignTmLeader( int pnid, bool checkProcess ) |
| { |
| const char method_name[] = "CCluster::AssignTmLeader"; |
| TRACE_ENTRY; |
| |
| int i = 0; |
| CNode *node = NULL; |
| CProcess *process = NULL; |
| |
| int TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid(); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - pnid=%d, checkProcess=%d, tmLeaderNid_=%d, TmLeaderPNid=%d\n" |
| , method_name, __LINE__ |
| , pnid, checkProcess, tmLeaderNid_, TmLeaderPNid ); |
| } |
| |
| if (TmLeaderPNid != pnid) |
| { |
| node = LNode[tmLeaderNid_]->GetNode(); |
| |
| if (checkProcess) |
| { |
| process = LNode[tmLeaderNid_]->GetProcessLByType( ProcessType_DTM ); |
| if (process) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| if (node) |
| trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, " |
| "isSoftNodeDown=%d, checkProcess=%d\n" |
| , method_name, __LINE__ |
| , node->GetPNid() |
| , node->GetName() |
| , NodePhaseString(node->GetPhase()) |
| , node->IsSoftNodeDown() |
| , checkProcess ); |
| } |
| return; |
| } |
| else |
| { |
| if (NameServerEnabled) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - Getting process from Name Server, nid=%d, type=%s\n" |
| , method_name, __LINE__ |
| , tmLeaderNid_, ProcessTypeString(ProcessType_DTM) ); |
| } |
| |
| process = Nodes->GetProcessLByTypeNs( tmLeaderNid_, ProcessType_DTM ); |
| if (process) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| if (node) |
| trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, " |
| "isSoftNodeDown=%d, checkProcess=%d\n" |
| , method_name, __LINE__ |
| , node->GetPNid() |
| , node->GetName() |
| , NodePhaseString(node->GetPhase()) |
| , node->IsSoftNodeDown() |
| , checkProcess ); |
| } |
| return; |
| } |
| } |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| if (node) |
| trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, " |
| "isSoftNodeDown=%d, checkProcess=%d\n" |
| , method_name, __LINE__ |
| , node->GetPNid() |
| , node->GetName() |
| , NodePhaseString(node->GetPhase()) |
| , node->IsSoftNodeDown() |
| , checkProcess ); |
| } |
| return; |
| } |
| } |
| |
| node = Node[TmLeaderPNid]; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d" " - Node " "%d" " TmLeader failed! (checkProcess=%d)\n" |
| , method_name, __LINE__, tmLeaderNid_, checkProcess ); |
| } |
| |
| for (i=0; i<GetConfigPNodesMax(); i++) |
| { |
| TmLeaderPNid++; |
| |
| if (TmLeaderPNid == GetConfigPNodesMax()) |
| { |
| TmLeaderPNid = 0; // restart with nid 0 |
| } |
| |
| if (TmLeaderPNid == pnid) |
| { |
| continue; // this is the node that is going down, skip it |
| } |
| |
| if (Node[TmLeaderPNid] == NULL) |
| { |
| continue; |
| } |
| |
| node = Node[TmLeaderPNid]; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n" |
| , method_name, __LINE__ |
| , node->GetPNid() |
| , node->GetName() |
| , NodePhaseString(node->GetPhase()) |
| , node->IsSoftNodeDown()); |
| } |
| |
| if ( node->IsSpareNode() || |
| node->IsSoftNodeDown() || |
| node->GetState() != State_Up || |
| node->GetPhase() != Phase_Ready ) |
| { |
| continue; // skip this node for any of the above reasons |
| } |
| |
| tmLeaderNid_ = node->GetFirstLNode()->GetNid(); |
| |
| if (checkProcess) |
| { |
| process = LNode[tmLeaderNid_]->GetProcessLByType( ProcessType_DTM ); |
| if (!process) |
| { |
| continue; // skip this node no DTM process exists |
| } |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d" " - Node " "%d" " is the new TmLeader." "\n", method_name, __LINE__, tmLeaderNid_); |
| } |
| |
| break; |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| |
| CCluster::CCluster (void) |
| :NumRanks (-1) |
| ,socks_(NULL) |
| ,sockPorts_(NULL) |
| ,commSock_(-1) |
| ,syncPort_(0) |
| ,syncSock_(-1) |
| #ifdef NAMESERVER_PROCESS |
| ,mon2nsSock_(-1) |
| #endif |
| ,epollFD_(-1), |
| Node (NULL), |
| LNode (NULL), |
| tmSyncPNid_ (-1), |
| currentNodes_ (0), |
| configPNodesCount_ (-1), |
| configPNodesMax_ (-1), |
| nodeMap_ (NULL), |
| #ifndef NAMESERVER_PROCESS |
| tmLeaderNid_ (-1), |
| tmReadyCount_(0), |
| #endif |
| minRecvCount_(4096), |
| recvBuffer_(NULL), |
| recvBuffer2_(NULL), |
| swpRecCount_(0), |
| barrierCount_(0), |
| allGatherCount_(0), |
| commDupCount_(0), |
| barrierCountSaved_(0), |
| allGatherCountSaved_(0), |
| commDupCountSaved_(0), |
| inBarrier_(false), |
| inAllGather_(false), |
| inCommDup_(false), |
| monInitComplete_(false), |
| monSyncResponsive_(true), |
| integratingPNid_(-1), |
| joinComm_(MPI_COMM_NULL), |
| joinSock_(-1), |
| lastSeqNum_(0), |
| lowSeqNum_(0), |
| highSeqNum_(0), |
| reconnectSeqNum_(0), |
| seqNum_(1), |
| waitForWatchdogExit_(false) |
| ,waitForNameServerExit_(false) |
| ,checkSeqNum_(false) |
| ,validateNodeDown_(false) |
| ,enqueuedDown_(false) |
| ,nodeDownDeathNotices_(true) |
| ,verifierNum_(0) |
| #ifdef NAMESERVER_PROCESS |
| ,myMonConnCount_(0) |
| ,minMonConnCount_(0) |
| ,minMonConnPnid_(-1) |
| #else |
| ,clusterProcCount_(0) |
| #endif |
| { |
| int i; |
| const char method_name[] = "CCluster::CCluster"; |
| TRACE_ENTRY; |
| |
| configMaster_ = -1; |
| MPI_Comm_set_errhandler(MPI_COMM_WORLD,MPI_ERRORS_RETURN); |
| |
| char *env = getenv("SQ_MON_CHECK_SEQNUM"); |
| if ( env ) |
| { |
| int val = atoi(env); |
| if ( val > 0) |
| { |
| checkSeqNum_ = (val != 0); |
| } |
| } |
| |
| if (trace_settings & TRACE_INIT) |
| trace_printf("%s@%d Checking sync sequence numbers is %s\n", |
| method_name, __LINE__, |
| (checkSeqNum_ ? "enabled" : "disabled")); |
| |
| CClusterConfig *clusterConfig = Nodes->GetClusterConfig(); |
| configPNodesMax_ = clusterConfig->GetPNodesConfigMax(); |
| |
| // get master from CClusterConfig |
| configMaster_ = clusterConfig->GetConfigMaster(); |
| |
| // Compute minimum "sync cycles" per second. The minimum is 1/10 |
| // the expected number, assuming "next_test_delay" cycles per second (where |
| // next_test_delay is in microseconds). |
| syncMinPerSec_ = 1000000 / next_test_delay / 10; |
| |
| agMaxElapsed_.tv_sec = 0; |
| agMaxElapsed_.tv_nsec = 0; |
| agMinElapsed_.tv_sec = 10000; |
| agMinElapsed_.tv_nsec = 0; |
| |
| // Allocate structures for monitor point-to-point communications |
| // |
| // The current approach is to allocate to a maximum number (MAX_NODES). |
| // |
| // The actual number could be based on the number of nodes configured |
| // which is better from a memory allocation perspective. However, |
| // this requires changing to an index-to-pnid map structure to access |
| // physical node objects (CNode) in the array structures and managing |
| // the map as nodes are added and deleted. (an optimization task) |
| // |
| comms_ = new MPI_Comm[MAX_NODES]; |
| otherMonRank_ = new int[MAX_NODES]; |
| socks_ = new int[MAX_NODES]; |
| sockPorts_ = new int[MAX_NODES]; |
| |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| upNodes_.upNodes[i] = 0; |
| } |
| |
| for (i=0; i < MAX_NODES; ++i) |
| { |
| comms_[i] = MPI_COMM_NULL; |
| socks_[i] = -1; |
| sockPorts_[i] = -1; |
| } |
| |
| env = getenv("SQ_MON_NODE_DOWN_VALIDATION"); |
| if ( env ) |
| { |
| int val = atoi(env); |
| if ( val > 0) |
| { |
| validateNodeDown_ = (val != 0); |
| } |
| } |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s] Validation of node down is %s\n", |
| method_name, (validateNodeDown_ ? "enabled" : "disabled")); |
| mon_log_write(MON_CLUSTER_CLUSTER_1, SQ_LOG_INFO, buf); |
| |
| InitializeConfigCluster(); |
| |
| for (size_t j=0; j<(sizeof(agElapsed_)/sizeof(int)); ++j) |
| { |
| agElapsed_[j] = 0; |
| } |
| |
| char *p = getenv("MON_MIN_RECV_COUNT"); |
| if ( p ) |
| { |
| long int val = strtoul(p, NULL, 10); |
| if (errno != ERANGE) |
| { |
| minRecvCount_ = val; |
| } |
| } |
| |
| p = getenv("SQ_MON_NODE_DOWN_DEATH_MESSAGES"); |
| if ( p && atoi(p) == 0) |
| { |
| nodeDownDeathNotices_ = false; |
| } |
| |
| // build the node objects & Sync collision assignment arrays |
| // these buffers will be used in ShareWithPeers in AllGather |
| // operation to get TMSync data as well as Replication data. |
| // Allocate the maximum allowed so that we pay the price only once. |
| // This wastes a bit of memory but reduces complexity when |
| // adding and deleting nodes. Usage is based on GetConfigPNodesMax() |
| // the maximum number that can be configured. |
| recvBuffer_ = new struct sync_buffer_def[GetConfigPNodesMax()]; |
| recvBuffer2_ = new struct sync_buffer_def[GetConfigPNodesMax()]; |
| |
| TRACE_EXIT; |
| } |
| |
| CCluster::~CCluster (void) |
| { |
| const char method_name[] = "CCluster::~CCluster"; |
| TRACE_ENTRY; |
| |
| if (epollFD_ != -1) |
| { |
| close( epollFD_ ); |
| } |
| |
| if (commSock_ != -1) |
| { |
| close( commSock_ ); |
| } |
| |
| if (syncSock_ != -1) |
| { |
| close( syncSock_ ); |
| } |
| |
| delete [] comms_; |
| delete [] otherMonRank_; |
| delete [] socks_; |
| delete [] sockPorts_; |
| if (nodeMap_) |
| { |
| delete [] nodeMap_; |
| nodeMap_ = NULL; |
| } |
| |
| delete [] recvBuffer2_; |
| delete [] recvBuffer_; |
| |
| TRACE_EXIT; |
| } |
| |
| int CCluster::incrGetVerifierNum() |
| { |
| verifierNum_++; |
| if ( verifierNum_ < 0 ) |
| { |
| verifierNum_ = 0; |
| } |
| |
| return verifierNum_; |
| } |
| |
| // For a reintegrated monitor node, following the first sync cycle, obtain the |
| // current sync cycle sequence number. And verify that all nodes agree |
| // on the sequence number. |
| unsigned long long CCluster::EnsureAndGetSeqNum(cluster_state_def_t nodestate[]) |
| { |
| const char method_name[] = "CCluster::EnsureAndGetSeqNum"; |
| TRACE_ENTRY; |
| |
| unsigned long long seqNum = 0; |
| |
| for (int i = 0; i < GetConfigPNodesCount(); i++) |
| { |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, i, nodestate[indexToPnid_[i]].seq_num, seqNum ); |
| } |
| if (nodestate[indexToPnid_[i]].seq_num > 1) |
| { |
| if (seqNum == 0) |
| { |
| seqNum = nodestate[indexToPnid_[i]].seq_num; |
| } |
| else |
| { |
| assert(nodestate[indexToPnid_[i]].seq_num == seqNum); |
| } |
| } |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, i, nodestate[indexToPnid_[i]].seq_num, seqNum ); |
| } |
| } |
| |
| TRACE_EXIT; |
| return seqNum; |
| } |
| |
| |
| void CCluster::HardNodeDown (int pnid, bool communicate_state) |
| { |
| #ifndef NAMESERVER_PROCESS |
| char port_fname[MAX_PROCESS_PATH]; |
| char temp_fname[MAX_PROCESS_PATH]; |
| #endif |
| CNode *node; |
| CLNode *lnode; |
| char buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CCluster::HardNodeDown"; |
| TRACE_ENTRY; |
| |
| node = Nodes->GetNode(pnid); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - pnid=%d, comm_state=%d, state=%s, isInQuiesceState=%d," |
| " (local pnid=%d, state=%s, isInQuiesceState=%d, " |
| "shutdown level=%d)\n", method_name, __LINE__, |
| pnid, communicate_state, StateString(node->GetState()), |
| node->isInQuiesceState(), |
| MyPNID, StateString(MyNode->GetState()), |
| MyNode->isInQuiesceState(), MyNode->GetShutdownLevel() ); |
| |
| if (( MyPNID == pnid ) && |
| ( MyNode->GetState() == State_Down || |
| MyNode->IsKillingNode() ) ) |
| { |
| // we are coming down ... don't process it |
| if ( !IsRealCluster && MyNode->isInQuiesceState()) |
| { |
| // in virtual env, this would be called after node quiescing, |
| // so continue with mark down processing. |
| } |
| else |
| { |
| return; |
| } |
| } |
| |
| if ( (MyNode->GetShutdownLevel() != ShutdownLevel_Undefined) && |
| (pnid != MyPNID) ) // some other node went down while shutdown was in progress |
| { |
| snprintf(buf, sizeof(buf), "[%s], Node failure during shutdown, down nid = %d\n", method_name, pnid); |
| mon_log_write(MON_CLUSTER_MARKDOWN_1, SQ_LOG_ERR, buf); |
| |
| if (!waitForWatchdogExit_) // if WDT is not exiting |
| { |
| // bring down this node because TSE backup processes may not exit |
| // if the primary was on the node that went down. |
| ReqQueue.enqueueDownReq(MyPNID); |
| } |
| } |
| |
| if ( communicate_state && pnid != MyPNID ) |
| { |
| // just communicate the change and let the real node handle it. |
| node->SetChangeState( true ); |
| return; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| if ( !Emulate_Down ) |
| { |
| if( !IsRealCluster ) |
| { |
| snprintf(port_fname, sizeof(port_fname), "%s/monitor.%d.port.%s",getenv("MPI_TMPDIR"),pnid,node->GetName()); |
| } |
| else |
| { |
| // Remove the domain portion of the name if any |
| char short_node_name[MPI_MAX_PROCESSOR_NAME]; |
| char str1[MPI_MAX_PROCESSOR_NAME]; |
| memset( short_node_name, 0, MPI_MAX_PROCESSOR_NAME ); |
| memset( str1, 0, MPI_MAX_PROCESSOR_NAME ); |
| strcpy (str1, node->GetName() ); |
| |
| char *str1_dot = strchr( (char *) str1, '.' ); |
| if ( str1_dot ) |
| { |
| memcpy( short_node_name, str1, str1_dot - str1 ); |
| } |
| else |
| { |
| strcpy (short_node_name, str1 ); |
| } |
| snprintf(port_fname, sizeof(port_fname), "%s/monitor.port.%s",getenv("MPI_TMPDIR"),short_node_name); |
| } |
| sprintf(temp_fname, "%s.bak", port_fname); |
| remove(temp_fname); |
| rename(port_fname, temp_fname); |
| } |
| #endif |
| |
| if (node->GetState() != State_Down || !node->isInQuiesceState()) |
| { |
| snprintf(buf, sizeof(buf), |
| "[CCluster::HardNodeDown], Node %s (%d) is going down.\n", |
| node->GetName(), node->GetPNid()); |
| mon_log_write(MON_CLUSTER_MARKDOWN_2, SQ_LOG_CRIT, buf); |
| |
| node->SetKillingNode( true ); |
| |
| if ( MyPNID == pnid && |
| (MyNode->GetState() == State_Up || MyNode->GetState() == State_Shutdown) && |
| !MyNode->isInQuiesceState() ) |
| { |
| STATE state = MyNode->GetState(); |
| switch ( state ) |
| { |
| case State_Up: |
| case State_Shutdown: |
| // do node quiescing and let HealthCheck thread know that quiescing has started |
| // setting internal state to 'quiesce' will prevent replicating process exits |
| // and reject normal shutdown requests in all nodes while we are quiescing. |
| if (!waitForWatchdogExit_) // if WDT is not exiting |
| { |
| MyNode->setQuiesceState(); |
| HealthCheck.setState(MON_NODE_QUIESCE); |
| } |
| break; |
| default: // in all other states |
| if ( ! Emulate_Down ) |
| { |
| // make sure no processes are alive if in the middle of re-integration |
| #ifndef NAMESERVER_PROCESS |
| node->KillAllDown(); |
| #endif |
| snprintf(buf, sizeof(buf), |
| "[CCluster::HardNodeDown], Node %s (%d)is down.\n", |
| node->GetName(), node->GetPNid()); |
| mon_log_write(MON_CLUSTER_MARKDOWN_3, SQ_LOG_ERR, buf); |
| // Don't generate a core file, abort is intentional |
| struct rlimit limit; |
| limit.rlim_cur = 0; |
| limit.rlim_max = 0; |
| setrlimit(RLIMIT_CORE, &limit); |
| MPI_Abort(MPI_COMM_SELF,99); |
| } |
| } |
| } |
| else |
| { |
| if ( node->GetPNid() == integratingPNid_ ) |
| { |
| ResetIntegratingPNid(); |
| } |
| #ifndef NAMESERVER_PROCESS |
| node->KillAllDown(); |
| #endif |
| node->SetState( State_Down ); |
| // Send node down message to local node's processes |
| lnode = node->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| lnode->Down(); |
| } |
| if ( ZClientEnabled ) |
| { |
| ZClient->WatchNodeDelete( node->GetName() ); |
| ZClient->WatchNodeMasterDelete( node->GetName() ); |
| } |
| } |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| // we need to abort any active TmSync |
| if (( MyNode->GetTmSyncState() == SyncState_Start ) || |
| ( MyNode->GetTmSyncState() == SyncState_Continue ) || |
| ( MyNode->GetTmSyncState() == SyncState_Commit ) ) |
| { |
| MyNode->SetTmSyncState( SyncState_Abort ); |
| Monitor->SetAbortPendingTmSync(); |
| if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() )); |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| if ( Emulate_Down ) |
| { |
| AssignTmLeader(pnid, false); |
| } |
| else |
| #endif |
| { |
| AssignLeaders(pnid, node->GetName(), false); |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::SoftNodeDown( int pnid ) |
| { |
| CNode *node; |
| char buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CCluster::SoftNodeDown"; |
| TRACE_ENTRY; |
| |
| node = Nodes->GetNode(pnid); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - pnid=%d, state=%s, phase=%s, isInQuiesceState=%d, isSoftNodeDown=%d" |
| " (local pnid=%d, state=%s, phase=%s, isInQuiesceState=%d, isSoftNodeDown=%d " |
| "shutdown level=%d)\n" |
| , method_name, __LINE__ |
| , pnid, StateString(node->GetState()) |
| , NodePhaseString(node->GetPhase()) |
| , node->isInQuiesceState() |
| , node->IsSoftNodeDown() |
| , MyPNID, StateString(MyNode->GetState()) |
| , NodePhaseString(MyNode->GetPhase()) |
| , MyNode->isInQuiesceState() |
| , MyNode->IsSoftNodeDown() |
| , MyNode->GetShutdownLevel() ); |
| } |
| |
| if (( MyPNID == pnid ) && |
| ( MyNode->GetState() == State_Down || |
| MyNode->IsKillingNode() ) ) |
| { |
| // we are coming down ... don't process it |
| return; |
| } |
| |
| snprintf( buf, sizeof(buf) |
| , "[%s], Node %s (%d) is going soft down.\n" |
| , method_name, node->GetName(), node->GetPNid()); |
| mon_log_write(MON_CLUSTER_SOFTNODEDOWN_1, SQ_LOG_ERR, buf); |
| |
| node->SetKillingNode( true ); |
| |
| if ( node->GetState() == State_Up ) |
| { |
| node->SetSoftNodeDown(); // Set soft down flag |
| node->SetPhase( Phase_SoftDown ); // Suspend TMSync on node |
| |
| if ( node->GetPNid() == MyPNID ) |
| { |
| // and tell remote monitor processes the node is soft down |
| CReplSoftNodeDown *repl = new CReplSoftNodeDown( MyPNID ); |
| Replicator.addItem(repl); |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| node->KillAllDownSoft(); // Kill all processes |
| #endif |
| |
| snprintf( buf, sizeof(buf) |
| , "[%s], Node %s (%d) executed soft down.\n" |
| , method_name, node->GetName(), node->GetPNid() ); |
| mon_log_write(MON_CLUSTER_SOFTNODEDOWN_2, SQ_LOG_ERR, buf); |
| } |
| else |
| { |
| snprintf( buf, sizeof(buf), |
| "[%s], Node %s (%d) soft node down not executed, state=%s\n" |
| , method_name, node->GetName() |
| , node->GetPNid() |
| , StateString(MyNode->GetState()) ); |
| mon_log_write(MON_CLUSTER_SOFTNODEDOWN_3, SQ_LOG_ERR, buf); |
| // Probably a programmer bonehead! |
| abort(); |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| // we need to abort any active TmSync |
| if (( MyNode->GetTmSyncState() == SyncState_Start ) || |
| ( MyNode->GetTmSyncState() == SyncState_Continue ) || |
| ( MyNode->GetTmSyncState() == SyncState_Commit ) ) |
| { |
| MyNode->SetTmSyncState( SyncState_Abort ); |
| Monitor->SetAbortPendingTmSync(); |
| if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() )); |
| } |
| #endif |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n" |
| , method_name, __LINE__ |
| , node->GetPNid() |
| , node->GetName() |
| , NodePhaseString(node->GetPhase()) |
| , node->IsSoftNodeDown()); |
| } |
| |
| AssignLeaders(pnid, node->GetName(), false); |
| |
| TRACE_EXIT; |
| } |
| |
| bool CCluster::CheckSpareSet( int pnid ) |
| { |
| bool activatedSpare = false; |
| bool done = false; |
| unsigned int ii; |
| unsigned int jj; |
| CNode *newNode = Nodes->GetNode( pnid ); |
| |
| const char method_name[] = "CCluster::CheckSpareSet"; |
| TRACE_ENTRY; |
| |
| // Build spare node set |
| CNode *spareNode; |
| NodesList spareSetList; |
| NodesList *spareNodesConfigList = Nodes->GetSpareNodesConfigList(); |
| NodesList::iterator itSn; |
| for ( itSn = spareNodesConfigList->begin(); |
| itSn != spareNodesConfigList->end() && !done ; itSn++ ) |
| { |
| spareNode = *itSn; |
| PNidVector sparePNids = spareNode->GetSparePNids(); |
| // if the new node is a spare node in the configuration |
| if ( newNode->GetPNid() == spareNode->GetPNid() ) |
| { |
| // Add the spare node and each node it is configured to spare to the set |
| spareSetList.push_back( spareNode ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - pnid=%d, name=(%s) is a configured Spare\n", method_name, __LINE__, spareNode->GetPNid(), spareNode->GetName()); |
| |
| for ( ii = 0; ii < sparePNids.size(); ii++ ) |
| { |
| spareSetList.push_back( Nodes->GetNode(sparePNids[ii]) ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - pnid=%d, name=(%s) is in Spare set\n", method_name, __LINE__, Nodes->GetNode(sparePNids[ii])->GetPNid(), Nodes->GetNode(sparePNids[ii])->GetName()); |
| } |
| done = true; |
| } |
| else |
| { |
| // Check each pnid it is configured to spare |
| for ( jj = 0; jj < sparePNids.size(); jj++ ) |
| { |
| // if the new node is in the spare set of a spare node |
| if ( newNode->GetPNid() == sparePNids[jj] ) |
| { |
| // Add the spare node and each node it is configured to spare to the set |
| spareSetList.push_back( spareNode ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - pnid=%d, name=(%s) is a configured Spare\n", method_name, __LINE__, spareNode->GetPNid(), spareNode->GetName()); |
| |
| for ( ii = 0; ii < sparePNids.size(); ii++ ) |
| { |
| spareSetList.push_back( Nodes->GetNode(sparePNids[ii]) ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - pnid=%d, name=(%s) is in Spare set\n", method_name, __LINE__, Nodes->GetNode(sparePNids[ii])->GetPNid(), Nodes->GetNode(sparePNids[ii])->GetName()); |
| } |
| done = true; |
| } |
| } |
| } |
| } |
| |
| if (newNode && trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - new node pnid=%d, name=(%s), zid=%d\n" |
| , method_name, __LINE__ |
| , newNode->GetPNid(), newNode->GetName(), newNode->GetZone()); |
| } |
| |
| // if the newNode still owns the zone |
| if ( newNode && newNode->GetZone() != -1 ) |
| { |
| // assume implicit spare node activation |
| // (no need to move logical nodes to physical node) |
| // since HardNodeUp() already set State_Up, |
| // just reset spare node flag and remove from available spare nodes |
| newNode->ResetSpareNode(); |
| Nodes->RemoveFromSpareNodesList( newNode ); |
| ActivateSpare( newNode, newNode ); |
| activatedSpare = true; |
| TRACE_EXIT; |
| return( activatedSpare ); |
| } |
| |
| CLNode *lnode; |
| CNode *node; |
| CNode *downNode = NULL; |
| |
| // Now check the state of each configured logical node in the set for down state |
| spareNode = newNode; // new node (pnid) is the spare to activate |
| NodesList::iterator itSs; |
| for ( itSs = spareSetList.begin(); itSs != spareSetList.end(); itSs++ ) |
| { |
| node = *itSs; |
| if ( node->GetPNid() != pnid ) |
| { |
| // Find the first down node |
| if ( !downNode ) |
| { |
| lnode = node->GetFirstLNode(); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - node nid=%d, pnid=%d(%s), state=%s\n" |
| , method_name, __LINE__, lnode?lnode->GetNid():-1 |
| , node->GetPNid(), node->GetName() |
| , StateString( node->GetState() ) ); |
| if ( lnode && lnode->GetState() == State_Down ) |
| { |
| downNode = node; |
| } |
| } |
| } |
| if ( spareNode && downNode ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - spare node pnid=%d (%s), down node pnid=%d (%s) \n" |
| , method_name, __LINE__ |
| , spareNode->GetPNid(), spareNode->GetName() |
| , downNode->GetPNid(), downNode->GetName()); |
| break; |
| } |
| } |
| |
| if ( spareNode && downNode ) |
| { |
| Nodes->RemoveFromSpareNodesList( spareNode ); |
| spareNode->ResetSpareNode(); |
| if ( downNode->GetPNid() != pnid ) |
| { // the spare node does not own the down logical nodes so activate it |
| ActivateSpare( spareNode, downNode ); |
| } |
| activatedSpare = true; |
| } |
| |
| TRACE_EXIT; |
| return( activatedSpare ); |
| } |
| |
| const char *JoiningPhaseString( JOINING_PHASE phase ) |
| { |
| const char *str; |
| |
| switch( phase ) |
| { |
| case JoiningPhase_Unknown: |
| str = "JoiningPhase_Unknown"; |
| break; |
| case JoiningPhase_1: |
| str = "JoiningPhase_1"; |
| break; |
| case JoiningPhase_2: |
| str = "JoiningPhase_2"; |
| break; |
| case JoiningPhase_3: |
| str = "JoiningPhase_3"; |
| break; |
| default: |
| str = "JoiningPhase - Undefined"; |
| break; |
| } |
| |
| return( str ); |
| } |
| |
| struct message_def *CCluster::JoinMessage( const char *node_name, int pnid, JOINING_PHASE phase ) |
| { |
| struct message_def *msg; |
| |
| const char method_name[] = "CCluster::JoinMessage"; |
| TRACE_ENTRY; |
| |
| // Record statistics (sonar counters) |
| if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED)) |
| MonStats->notice_death_Incr(); |
| |
| msg = new struct message_def; |
| msg->type = MsgType_NodeJoining; |
| msg->noreply = true; |
| msg->u.request.type = ReqType_Notice; |
| strcpy( msg->u.request.u.joining.node_name, node_name ); |
| msg->u.request.u.joining.pnid = pnid; |
| msg->u.request.u.joining.phase = phase; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - Joining notice for node %s (pnid=%d, phase=%d)\n", |
| method_name, __LINE__, node_name, pnid, phase ); |
| TRACE_EXIT; |
| |
| return msg; |
| } |
| |
| struct message_def *CCluster::SpareUpMessage( const char *node_name, int pnid ) |
| { |
| struct message_def *msg; |
| |
| const char method_name[] = "CCluster::SpareUpMessage"; |
| TRACE_ENTRY; |
| |
| // Record statistics (sonar counters) |
| if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED)) |
| MonStats->notice_death_Incr(); |
| |
| msg = new struct message_def; |
| msg->type = MsgType_SpareUp; |
| msg->noreply = true; |
| msg->u.request.type = ReqType_Notice; |
| strcpy( msg->u.request.u.spare_up.node_name, node_name ); |
| msg->u.request.u.spare_up.pnid = pnid; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - Spare node up notice for node %s nid=%d\n", |
| method_name, __LINE__, node_name, pnid ); |
| TRACE_EXIT; |
| |
| return msg; |
| } |
| |
| struct message_def *CCluster::ReIntegErrorMessage( const char *msgText ) |
| { |
| struct message_def *msg; |
| |
| const char method_name[] = "CCluster::ReIntegErrorMessage"; |
| TRACE_ENTRY; |
| |
| msg = new struct message_def; |
| msg->type = MsgType_ReintegrationError; |
| msg->noreply = true; |
| msg->u.request.type = ReqType_Notice; |
| strncpy( msg->u.request.u.reintegrate.msg, msgText, |
| sizeof(msg->u.request.u.reintegrate.msg) ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - Reintegrate notice %s\n", |
| method_name, __LINE__, msgText ); |
| |
| TRACE_EXIT; |
| |
| return msg; |
| } |
| |
| int CCluster::HardNodeUp( int pnid, char *node_name ) |
| { |
| bool spareNodeActivated = false; |
| int rc = MPI_SUCCESS; |
| int tmCount = 0; |
| CNode *node; |
| CLNode *lnode; |
| STATE nodeState; |
| |
| const char method_name[] = "CCluster::HardNodeUp"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - pnid=%d, name=%s (MyPNID = %d), currentNodes_=%d\n" |
| , method_name, __LINE__, pnid, node_name, MyPNID, currentNodes_ ); |
| |
| if ( pnid == -1 ) |
| { |
| node = Nodes->GetNode( node_name ); |
| } |
| else |
| { |
| node = Nodes->GetNode( pnid ); |
| } |
| |
| if ( node == NULL ) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - Invalid node, pnid=%d, name=%s" "\n" |
| , method_name, __LINE__, pnid, node_name ); |
| |
| return( MPI_ERR_NAME ); |
| } |
| |
| nodeState = node->GetState(); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - Node state=%s" "\n" |
| , method_name, __LINE__, StateString( nodeState ) ); |
| if ( nodeState != State_Up ) |
| { |
| if ( nodeState == State_Down ) |
| { |
| node->SetKillingNode( false ); |
| if ( Emulate_Down ) |
| { |
| // Any DTMs running? |
| for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ ) |
| { |
| CNode *tempNode = Nodes->GetNodeByMap( i ); |
| lnode = tempNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| CProcess *process = lnode->GetProcessLByType( ProcessType_DTM ); |
| if ( process ) tmCount++; |
| } |
| } |
| if ( tmCount ) |
| { |
| IAmIntegrated = true; |
| } |
| // We need to remove any old process objects before we restart the node. |
| node->CleanUpProcesses(); |
| node->SetState( State_Up ); |
| if ( MyPNID == pnid ) |
| { |
| MyNode->clearQuiesceState(); |
| HealthCheck.initializeVars(); |
| SMSIntegrating = true; |
| #ifndef NAMESERVER_PROCESS |
| Monitor->StartPrimitiveProcesses(); |
| #endif |
| // Let other monitors know this node is up |
| CReplNodeUp *repl = new CReplNodeUp(MyPNID); |
| Replicator.addItem(repl); |
| } |
| else |
| { |
| if ( tmCount ) |
| { |
| #ifndef NAMESERVER_PROCESS |
| // Send node prepare notice to local DTM processes |
| lnode = node->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| lnode->PrepareForTransactions( true ); |
| } |
| #endif |
| } |
| else |
| { |
| // Process logical node up |
| lnode = node->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| lnode->Up(); |
| } |
| } |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - Unexpectedly executing HardNodeUp. Expecting to do accept in commAccept thread\n", |
| method_name, __LINE__ ); |
| |
| } |
| } |
| else if ( nodeState == State_Merged ) |
| { |
| node->SetKillingNode( false ); |
| node->SetState( State_Joining ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d" " - New monitor %s, pnid=%d, state=%s" "\n" |
| , method_name, __LINE__, node->GetName(), node->GetPNid(), StateString( node->GetState() ) ); |
| for ( int i =0; i < Nodes->GetPNodesCount(); i++ ) |
| { |
| trace_printf( "%s@%d socks_[indexToPnid_[%d]=%d]=%d, sockPorts_[indexToPnid_[%d]=%d]=%d\n" |
| , method_name, __LINE__ |
| , i, indexToPnid_[i], socks_[indexToPnid_[i]] |
| , i, indexToPnid_[i], sockPorts_[indexToPnid_[i]] ); |
| } |
| } |
| if ( MyNode->IsCreator() ) |
| { |
| #ifndef NAMESERVER_PROCESS |
| SQ_theLocalIOToClient->putOnNoticeQueue( MyNode->GetCreatorPid() |
| , MyNode->GetCreatorVerifier() |
| , JoinMessage( node->GetName() |
| , node->GetPNid() |
| , JoiningPhase_1 ) |
| , NULL); |
| #endif |
| |
| // save the current seq num in the snapshot request. |
| // this sequence number will match the state of the cluster |
| // when this request is processed. |
| ReqQueue.enqueueSnapshotReq(seqNum_); |
| } |
| if ( MyPNID == pnid ) |
| { |
| // request and process revive packet from the creator. |
| // when complete, this will call HardNodeUp again. |
| ReqQueue.enqueueReviveReq( ); |
| } |
| else |
| { |
| if ( ZClientEnabled ) |
| { |
| rc = ZClient->WatchNode( node->GetName() ); |
| if ( rc != ZOK ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Unable to set node watch on %s, pnid%d\n" |
| , method_name, node->GetName(), node->GetPNid() ); |
| mon_log_write(MON_CLUSTER_HARDNODEUP_1, SQ_LOG_ERR, buf); |
| } |
| } |
| } |
| } |
| else if ( nodeState == State_Joining ) |
| { |
| // The new monitor comes in here first and schedules a node up request on all nodes. |
| // All other monitors come here next, including the creator. |
| // The new monitor will not come here again because |
| // CReplNodeUp is a noop for the one who schedules it. |
| node->SetState( State_Up ); |
| |
| if ( Nodes->GetSNodesCount() == 0 ) |
| { // Spare nodes not configured so bring up my logical nodes |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - No spare nodes configured node=%s, pnid=%d, state=%s\n" |
| , method_name, __LINE__, node->GetName(), node->GetPNid() |
| , StateString(node->GetState()) ); |
| if ( MyPNID == pnid ) |
| { |
| ActivateSpare( node, node ); |
| } |
| } |
| else |
| { |
| node->SetSpareNode(); |
| Nodes->AddToSpareNodesList( node->GetPNid() ); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - Adding to available spares node=%s, pnid=%d\n" |
| , method_name, __LINE__, node->GetName(), node->GetPNid() ); |
| // Check for a node down in spare set and activate down node if found |
| spareNodeActivated = CheckSpareSet( node->GetPNid() ); |
| if ( spareNodeActivated ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - Activated spare node=%s, pnid=%d\n" |
| , method_name, __LINE__, node->GetName(), node->GetPNid() ); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - Available spare node=%s, pnid=%d\n" |
| , method_name, __LINE__, node->GetName(), node->GetPNid() ); |
| |
| // Spare node not activated |
| if ( MyNode->IsCreator() ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - Sending spare up notice to creator shell(%d) spare node=%s, pnid=%d\n" |
| , method_name, __LINE__, MyNode->GetCreatorPid(), node->GetName(), node->GetPNid() ); |
| #ifndef NAMESERVER_PROCESS |
| // Tell creator spare node is up |
| SQ_theLocalIOToClient->putOnNoticeQueue( MyNode->GetCreatorPid() |
| , MyNode->GetCreatorVerifier() |
| , SpareUpMessage( node->GetName() |
| , node->GetPNid() ) |
| , NULL); |
| #endif |
| } |
| } |
| } |
| |
| if ( MyPNID == pnid ) |
| { |
| // Any DTMs running? |
| for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ ) |
| { |
| CNode *tempNode = Nodes->GetNodeByMap( i ); |
| lnode = tempNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| CProcess *process = lnode->GetProcessLByType( ProcessType_DTM ); |
| if ( process ) tmCount++; |
| } |
| } |
| if ( !tmCount && !spareNodeActivated ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - Replicating node up %s, pnid=%d, state=%s, spare=%d, DTM count=%d\n" |
| , method_name, __LINE__, node->GetName(), node->GetPNid() |
| , StateString(node->GetState()), node->IsSpareNode(), tmCount ); |
| // Let other monitors know this node is up |
| CReplNodeUp *repl = new CReplNodeUp(MyPNID); |
| Replicator.addItem(repl); |
| } |
| } |
| |
| ResetIntegratingPNid(); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d" " - New monitor %s, pnid=%d, state=%s, spare=%d\n" |
| , method_name, __LINE__, node->GetName(), node->GetPNid() |
| , StateString(node->GetState()), node->IsSpareNode() ); |
| } |
| } |
| |
| TRACE_EXIT; |
| return( rc ); |
| } |
| |
| int CCluster::SoftNodeUpPrepare( int pnid ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| int rc = MPI_SUCCESS; |
| int tmCount = 0; |
| CNode *node; |
| CLNode *lnode; |
| STATE nodeState; |
| |
| const char method_name[] = "CCluster::SoftNodeUpPrepare"; |
| TRACE_ENTRY; |
| |
| node = Nodes->GetNode( pnid ); |
| if ( node == NULL ) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - Invalid node, pnid=%d\n" |
| , method_name, __LINE__, pnid ); |
| |
| return( MPI_ERR_NAME ); |
| } |
| |
| nodeState = node->GetState(); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - Node name=%s, pnid=%d, state=%s, soft down=%d\n" |
| , method_name, __LINE__ |
| , node->GetName() |
| , node->GetPNid() |
| , StateString( nodeState ) |
| , node->IsSoftNodeDown() ); |
| |
| if ( nodeState != State_Up ) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - Unexpectedly executing SoftNodeUp\n", |
| method_name, __LINE__ ); |
| // Programmer bonehead! |
| abort(); |
| } |
| |
| node->SetKillingNode( false ); |
| |
| node->ResetSoftNodeDown( ); |
| |
| node->SetPhase( Phase_Ready ); |
| |
| if ( MyPNID == pnid ) |
| { |
| SMSIntegrating = true; |
| #ifndef NAMESERVER_PROCESS |
| node->SetSoftNodeUp( ); |
| Monitor->StartPrimitiveProcesses(); |
| #endif |
| // Let other monitors know this node is preparing to soft up |
| CReplSoftNodeUp *repl = new CReplSoftNodeUp(MyPNID); |
| Replicator.addItem(repl); |
| } |
| else |
| { |
| // Any DTMs running? |
| for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ ) |
| { |
| CNode *tempNode = Nodes->GetNodeByMap( i ); |
| lnode = tempNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| CProcess *process = lnode->GetProcessLByType( ProcessType_DTM ); |
| if ( process ) tmCount++; |
| } |
| } |
| if ( tmCount ) |
| { |
| #ifndef NAMESERVER_PROCESS |
| // Send DTM restarted notice to local DTM processes |
| lnode = node->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| lnode->SendDTMRestarted(); |
| } |
| #endif |
| } |
| else |
| { |
| snprintf( buf, sizeof(buf), |
| "[%s], Node %s (%d) soft node up prepare not executed, state=%s, tmCount=%d\n" |
| , method_name, node->GetName() |
| , node->GetPNid() |
| , StateString(MyNode->GetState()) |
| , tmCount ); |
| mon_log_write(MON_CLUSTER_SOFTNODEUP_1, SQ_LOG_WARNING, buf); |
| } |
| } |
| |
| TRACE_EXIT; |
| return( rc ); |
| } |
| |
| |
| |
| |
| |
| const char *StateString( STATE state) |
| { |
| const char *str; |
| |
| switch( state ) |
| { |
| case State_Unknown: |
| str = "State_Unknown"; |
| break; |
| case State_Up: |
| str = "State_Up"; |
| break; |
| case State_Down: |
| str = "State_Down"; |
| break; |
| case State_Stopped: |
| str = "State_Stopped"; |
| break; |
| case State_Shutdown: |
| str = "State_Shutdown"; |
| break; |
| case State_Unlinked: |
| str = "State_Unlinked"; |
| break; |
| case State_Merging: |
| str = "State_Merging"; |
| break; |
| case State_Merged: |
| str = "State_Merged"; |
| break; |
| case State_Joining: |
| str = "State_Joining"; |
| break; |
| case State_Initializing: |
| str = "State_Initializing"; |
| break; |
| default: |
| str = "State - Undefined"; |
| break; |
| } |
| |
| return( str ); |
| } |
| |
| const char *SyncStateString( SyncState state) |
| { |
| const char *str; |
| |
| switch( state ) |
| { |
| case SyncState_Null: |
| str = "SyncState_Null"; |
| break; |
| case SyncState_Start: |
| str = "SyncState_Start"; |
| break; |
| case SyncState_Continue: |
| str = "SyncState_Continue"; |
| break; |
| case SyncState_Abort: |
| str = "SyncState_Abort"; |
| break; |
| case SyncState_Commit: |
| str = "SyncState_Commit"; |
| break; |
| case SyncState_Suspended: |
| str = "SyncState_Suspended"; |
| break; |
| default: |
| str = "SyncState - Undefined"; |
| break; |
| } |
| |
| return( str ); |
| } |
| |
| |
| #ifndef NAMESERVER_PROCESS |
| void CCluster::AddTmsyncMsg( struct sync_buffer_def *tmSyncBuffer |
| , struct sync_def *sync |
| , struct internal_msg_def *msg) |
| { |
| const char method_name[] = "CCluster::AddTmsyncMsg"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Requesting SyncType=%d\n", method_name, |
| __LINE__, sync->type); |
| |
| msg->type = InternalType_Sync; |
| msg->u.sync.type = sync->type; |
| msg->u.sync.pnid = sync->pnid; |
| msg->u.sync.syncnid = sync->syncnid; |
| msg->u.sync.tmleader = sync->tmleader; |
| msg->u.sync.state = sync->state; |
| msg->u.sync.count = sync->count; |
| if ( sync->type == SyncType_TmData ) |
| { |
| memmove (msg->u.sync.data, sync->data, sync->length); |
| } |
| msg->u.sync.length = sync->length; |
| |
| // We can have only have a single "InternalType_Sync" msg in our |
| // SyncBuffer, else we cause a collision. |
| |
| int msgSize = (MSG_HDR_SIZE + sizeof(sync_def) - MAX_SYNC_DATA |
| + sync->length ); |
| |
| // Insert the message size into the message header |
| msg->replSize = msgSize; |
| tmSyncBuffer->msgInfo.msg_count = 1; |
| tmSyncBuffer->msgInfo.msg_offset += msgSize; |
| |
| // Set end-of-buffer marker |
| msg = (struct internal_msg_def *) |
| &tmSyncBuffer->msg[tmSyncBuffer->msgInfo.msg_offset]; |
| msg->type = InternalType_Null; |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CCluster::DoDeviceReq(char * ldevName) |
| { |
| const char method_name[] = "CCluster::DoDeviceReq"; |
| TRACE_ENTRY; |
| |
| CProcess *process; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal device request for ldev %s\n", |
| method_name, __LINE__, ldevName); |
| Nodes->GetLNode(ldevName, &process); |
| if (!process) |
| { |
| if (trace_settings & TRACE_SYNC) |
| trace_printf("%s@%d - Device processing but can't find device %s\n", |
| method_name, __LINE__, ldevName); |
| } |
| else |
| { |
| CLogicalDevice *ldev; |
| ldev = Devices->GetLogicalDevice( ldevName ); |
| if ( !ldev ) |
| { // The device name is not known on this node |
| // we need to clone the device |
| ldev = Devices->CloneDevice( process ); |
| } |
| if ( ldev ) |
| { |
| bool rstate = false; |
| if ( ldev->Mounted() ) |
| { |
| rstate = ldev->UnMount( false ); |
| if (!rstate) |
| { |
| if (trace_settings & TRACE_REQUEST) |
| trace_printf("%s@%d - Can't unmount device %s for " |
| "process %s (%d, %d)\n", method_name, |
| __LINE__, ldev->name(), process->GetName(), |
| process->GetNid(), process->GetPid()); |
| } |
| } |
| if ( rstate ) |
| { |
| rstate = ldev->Mount( process, false ); |
| if (!rstate) |
| { |
| if (trace_settings & TRACE_REQUEST) |
| trace_printf("%s@%d - Can't mount device %s for " |
| "process %s (%d, %d)\n", method_name, |
| __LINE__, ldev->name(), process->GetName(), |
| process->GetNid(), process->GetPid()); |
| } |
| else |
| { |
| if (trace_settings & TRACE_REQUEST) |
| trace_printf("%s@%d - Mounted device %s for process " |
| "%s (%d, %d)\n", method_name, __LINE__, |
| ldev->name(), process->GetName(), |
| process->GetNid(), process->GetPid()); |
| } |
| } |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find ldev %s.\n", method_name, |
| ldevName); |
| mon_log_write(MON_CLUSTER_DODEVICEREQ_1, SQ_LOG_ERR, buf); |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifdef EXCHANGE_CPU_SCHEDULING_DATA |
| void CCluster::SaveSchedData( struct internal_msg_def *recv_msg ) |
| { |
| const char method_name[] = "CCluster::SaveSchedData"; |
| TRACE_ENTRY; |
| |
| int nid = recv_msg->u.scheddata.PNid; |
| Node[nid]->SetNumCores( recv_msg->u.scheddata.processors ); |
| Node[nid]->SetFreeMemory( recv_msg->u.scheddata.memory_free ); |
| Node[nid]->SetFreeSwap( recv_msg->u.scheddata.swap_free ); |
| Node[nid]->SetFreeCache( recv_msg->u.scheddata.cache_free ); |
| Node[nid]->SetMemTotal( recv_msg->u.scheddata.memory_total ); |
| Node[nid]->SetMemActive( recv_msg->u.scheddata.memory_active ); |
| Node[nid]->SetMemInactive( recv_msg->u.scheddata.memory_inactive ); |
| Node[nid]->SetMemDirty( recv_msg->u.scheddata.memory_dirty ); |
| Node[nid]->SetMemWriteback( recv_msg->u.scheddata.memory_writeback ); |
| Node[nid]->SetMemVMallocUsed( recv_msg->u.scheddata.memory_VMallocUsed ); |
| Node[nid]->SetBTime( recv_msg->u.scheddata.btime ); |
| |
| CLNode *lnode; |
| lnode = Node[nid]->GetFirstLNode(); |
| int i = 0; |
| |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| lnode->SetCpuUser(recv_msg->u.scheddata.proc_stats[i].cpu_user); |
| lnode->SetCpuNice(recv_msg->u.scheddata.proc_stats[i].cpu_nice); |
| lnode->SetCpuSystem(recv_msg->u.scheddata.proc_stats[i].cpu_system); |
| lnode->SetCpuIdle(recv_msg->u.scheddata.proc_stats[i].cpu_idle); |
| lnode->SetCpuIowait(recv_msg->u.scheddata.proc_stats[i].cpu_iowait); |
| lnode->SetCpuIrq(recv_msg->u.scheddata.proc_stats[i].cpu_irq); |
| lnode->SetCpuSoftIrq(recv_msg->u.scheddata.proc_stats[i].cpu_soft_irq); |
| |
| ++i; |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg, |
| int pnid) |
| { |
| const char method_name[] = "CCluster::HandleOtherNodeMsg"; |
| TRACE_ENTRY; |
| |
| CNode *downNode; |
| CNode *spareNode; |
| #ifndef NAMESERVER_PROCESS |
| CProcess *process; |
| CLNode *lnode; |
| #endif |
| |
| switch (recv_msg->type) |
| { |
| case InternalType_Null: |
| if (trace_settings & TRACE_SYNC_DETAIL) |
| trace_printf("%s@%d - Node n%d has nothing to " |
| "update. \n", method_name, __LINE__, pnid); |
| break; |
| |
| case InternalType_ActivateSpare: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal activate spare request, spare pnid=%d, down pnid=%d\n" |
| , method_name, __LINE__ |
| , recv_msg->u.activate_spare.spare_pnid |
| , recv_msg->u.activate_spare.down_pnid); |
| |
| downNode = NULL; |
| if ( recv_msg->u.activate_spare.down_pnid != -1 ) |
| { |
| downNode = Nodes->GetNode( recv_msg->u.activate_spare.down_pnid ); |
| } |
| spareNode = Nodes->GetNode( recv_msg->u.activate_spare.spare_pnid ); |
| ReqQueue.enqueueActivateSpareReq( spareNode, downNode ); |
| break; |
| |
| case InternalType_NameServerAdd: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Internal NameServer add request for node_name=%s\n" |
| , method_name, __LINE__ |
| , recv_msg->u.nameserver_add.node_name ); |
| |
| // Queue the nameserver add request for processing by a worker thread. |
| ReqQueue.enqueueNameServerAddReq( recv_msg->u.nameserver_add.req_nid |
| , recv_msg->u.nameserver_add.req_pid |
| , recv_msg->u.nameserver_add.req_verifier |
| , recv_msg->u.nameserver_add.node_name ); |
| break; |
| |
| case InternalType_NameServerDelete: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Internal NameServer delete request for node=%s\n" |
| , method_name, __LINE__, recv_msg->u.nameserver_delete.node_name); |
| |
| // Queue the nameserver delete request for processing by a worker thread. |
| ReqQueue.enqueueNameServerDeleteReq( recv_msg->u.nameserver_delete.req_nid |
| , recv_msg->u.nameserver_delete.req_pid |
| , recv_msg->u.nameserver_delete.req_verifier |
| , recv_msg->u.nameserver_delete.node_name ); |
| break; |
| |
| case InternalType_NodeAdd: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Internal node add request for node_name=%s, " |
| "first_core=%d, last_core=%d, " |
| "processors=%d, roles=%d\n" |
| , method_name, __LINE__ |
| , recv_msg->u.node_add.node_name |
| , recv_msg->u.node_add.first_core |
| , recv_msg->u.node_add.last_core |
| , recv_msg->u.node_add.processors |
| , recv_msg->u.node_add.roles ); |
| |
| // Queue the node add request for processing by a worker thread. |
| ReqQueue.enqueueNodeAddReq( recv_msg->u.node_add.req_nid |
| , recv_msg->u.node_add.req_pid |
| , recv_msg->u.node_add.req_verifier |
| , recv_msg->u.node_add.node_name |
| , recv_msg->u.node_add.first_core |
| , recv_msg->u.node_add.last_core |
| , recv_msg->u.node_add.processors |
| , recv_msg->u.node_add.roles ); |
| break; |
| |
| case InternalType_Clone: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal clone request, process (%d, %d)" |
| " %s\n", method_name, __LINE__, |
| recv_msg->u.clone.nid, recv_msg->u.clone.os_pid, |
| (recv_msg->u.clone.backup?" Backup":"")); |
| |
| ReqQueue.enqueueCloneReq( &recv_msg->u.clone ); |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Device: |
| ReqQueue.enqueueDeviceReq(recv_msg->u.device.ldev_name); |
| break; |
| #endif |
| |
| case InternalType_Shutdown: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal shutdown request for level=%d\n", method_name, __LINE__, recv_msg->u.shutdown.level); |
| |
| // Queue the shutdown request for processing by a worker thread. |
| ReqQueue.enqueueShutdownReq( recv_msg->u.shutdown.level ); |
| break; |
| |
| case InternalType_NodeDelete: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Internal node delete request for pnid=%d\n" |
| , method_name, __LINE__, recv_msg->u.node_delete.pnid); |
| |
| // Queue the node delete request for processing by a worker thread. |
| ReqQueue.enqueueNodeDeleteReq( recv_msg->u.node_delete.req_nid |
| , recv_msg->u.node_delete.req_pid |
| , recv_msg->u.node_delete.req_verifier |
| , recv_msg->u.node_delete.pnid ); |
| break; |
| |
| case InternalType_Down: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal down node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid); |
| |
| // Queue the node down request for processing by a worker thread. |
| ReqQueue.enqueueDownReq( recv_msg->u.down.pnid ); |
| break; |
| |
| case InternalType_NodeName: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal node name request (%s to %s)\n", method_name, __LINE__, recv_msg->u.node_name.current_name, recv_msg->u.node_name.new_name); |
| |
| // Queue the node name request for processing by a worker thread. |
| ReqQueue.enqueueNodeNameReq( recv_msg->u.node_name.req_nid |
| , recv_msg->u.node_name.req_pid |
| , recv_msg->u.node_name.req_verifier |
| , recv_msg->u.node_name.current_name |
| , recv_msg->u.node_name.new_name ); |
| break; |
| |
| case InternalType_SoftNodeDown: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal soft node down request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid); |
| |
| // Queue the node down request for processing by a worker thread. |
| ReqQueue.enqueueSoftNodeDownReq( recv_msg->u.down.pnid ); |
| break; |
| |
| case InternalType_SoftNodeUp: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal soft node up request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid); |
| |
| // Queue the node up request for processing by a worker thread. |
| ReqQueue.enqueueSoftNodeUpReq( recv_msg->u.up.pnid ); |
| break; |
| |
| case InternalType_Up: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid); |
| |
| // Queue the node up request for processing by a worker thread. |
| ReqQueue.enqueueUpReq( recv_msg->u.up.pnid, NULL, -1 ); |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Dump: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal dump request for nid=%d, pid=%d\n", |
| method_name, __LINE__, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid); |
| lnode = Nodes->GetLNode( recv_msg->u.dump.nid ); |
| if ( lnode ) |
| { |
| process = lnode->GetProcessL(recv_msg->u.dump.pid); |
| |
| if (process) |
| { |
| int verifier = recv_msg->u.dump.verifier; |
| if ( (verifier == -1) || (verifier == process->GetVerifier()) ) |
| { |
| process->DumpBegin(recv_msg->u.dump.dumper_nid, |
| recv_msg->u.dump.dumper_pid, |
| recv_msg->u.dump.dumper_verifier, |
| recv_msg->u.dump.core_file); |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d, verifier=%d for dump target.\n", method_name, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid, |
| recv_msg->u.dump.verifier); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_1, SQ_LOG_ERR, buf); |
| } |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d for dump target.\n", method_name, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_2, SQ_LOG_ERR, buf); |
| } |
| } |
| |
| break; |
| |
| case InternalType_DumpComplete: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal dump-complete request for nid=%d, pid=%d\n", |
| method_name, __LINE__, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid); |
| lnode = Nodes->GetLNode( recv_msg->u.dump.nid ); |
| if ( lnode ) |
| { |
| process = lnode->GetProcessL(recv_msg->u.dump.pid); |
| |
| if (process) |
| { |
| int verifier = recv_msg->u.dump.verifier; |
| if ( (verifier == -1) || (verifier == process->GetVerifier()) ) |
| { |
| process->DumpEnd(recv_msg->u.dump.status, recv_msg->u.dump.core_file); |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d, verifier=%d for dump target.\n", method_name, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid, |
| recv_msg->u.dump.verifier); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_3, SQ_LOG_ERR, buf); |
| } |
| } |
| else |
| { |
| // Dump completion handled in CProcess::Exit() |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d for dump complete target.\n", method_name, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_4, SQ_LOG_ERR, buf); |
| } |
| } |
| break; |
| #endif |
| |
| case InternalType_Exit: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal exit request for %s (%d, %d)\n", method_name, __LINE__, recv_msg->u.exit.name, recv_msg->u.exit.nid, recv_msg->u.exit.pid); |
| #ifndef NAMESERVER_PROCESS |
| ReqQueue.enqueueExitReq( &recv_msg->u.exit ); |
| #else |
| ReqQueue.enqueueExitNsReq( &recv_msg->u.exit_ns ); |
| #endif |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Event: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal event request\n", method_name, __LINE__); |
| if ( MyNode->IsMyNode(recv_msg->u.event.nid) ) |
| { |
| if (trace_settings & TRACE_SYNC) |
| trace_printf("%s@%d - processing event for (%d, %d)\n", method_name, __LINE__, recv_msg->u.event.nid, recv_msg->u.event.pid); |
| |
| lnode = Nodes->GetLNode( recv_msg->u.event.nid ); |
| if ( lnode ) |
| { |
| process = lnode->GetProcessL(recv_msg->u.event.pid); |
| |
| if (process) |
| { |
| int verifier = recv_msg->u.dump.verifier; |
| if ( (verifier == -1) || (verifier == process->GetVerifier()) ) |
| { |
| process->GenerateEvent (recv_msg->u.event.event_id, |
| recv_msg->u.event.length, |
| &recv_msg->u.event.data); |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d, verifier=%d for event=%d\n", method_name, |
| recv_msg->u.event.nid, recv_msg->u.event.pid, |
| recv_msg->u.event.verifier, recv_msg->u.event.event_id); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_5, SQ_LOG_ERR, buf); |
| } |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid" |
| "=%d, pid=%d for processing event.\n", |
| method_name, |
| recv_msg->u.event.nid, recv_msg->u.event.pid); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_6, SQ_LOG_ERR, |
| buf); |
| } |
| } |
| } |
| break; |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_IoData: |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_REDIRECTION)) |
| trace_printf("%s@%d - Internal IO data request\n", method_name, __LINE__); |
| if ( MyNode->IsMyNode(recv_msg->u.iodata.nid) ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REDIRECTION)) |
| trace_printf("%s@%d - processing IO Data for (%d, %d)\n", method_name, __LINE__, recv_msg->u.iodata.nid, recv_msg->u.iodata.pid); |
| |
| lnode = Nodes->GetLNode( recv_msg->u.iodata.nid ); |
| if ( lnode ) |
| { |
| process = lnode->GetProcessL(recv_msg->u.iodata.pid); |
| |
| if (process) |
| { |
| int fd; |
| if (recv_msg->u.iodata.ioType == STDIN_DATA) |
| { |
| fd = process->FdStdin(); |
| } |
| else |
| { |
| fd = process->FdStdout(); |
| } |
| Redirector.disposeIoData(fd, |
| recv_msg->u.iodata.length, |
| recv_msg->u.iodata.data); |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid" |
| "=%d, pid=%d for processing IO Data.\n", |
| method_name, |
| recv_msg->u.iodata.nid, recv_msg->u.iodata.pid); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_7, SQ_LOG_ERR, |
| buf); |
| } |
| } |
| } |
| break; |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_StdinReq: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal STDIN request\n", method_name, __LINE__); |
| if ( !MyNode->IsMyNode(recv_msg->u.stdin_req.supplier_nid) ) |
| { |
| break; |
| |
| } |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REDIRECTION)) |
| trace_printf("%s@%d - stdin request from (%d,%d)" |
| ", type=%d, for supplier (%d, %d)\n", |
| method_name, __LINE__, |
| recv_msg->u.stdin_req.nid, |
| recv_msg->u.stdin_req.pid, |
| recv_msg->u.stdin_req.reqType, |
| recv_msg->u.stdin_req.supplier_nid, |
| recv_msg->u.stdin_req.supplier_pid); |
| |
| lnode = Nodes->GetLNode( recv_msg->u.stdin_req.nid ); |
| if ( lnode == NULL ) |
| { |
| break; |
| } |
| process = lnode->GetProcessL(recv_msg->u.stdin_req.pid); |
| |
| if (process) |
| { |
| if (recv_msg->u.stdin_req.reqType == STDIN_REQ_DATA) |
| { |
| // Set up to forward stdin data to requester. |
| // Save file descriptor associated with stdin |
| // so can find the redirector object later. |
| CProcess *supProcess; |
| lnode = Nodes->GetLNode( recv_msg->u.stdin_req.supplier_nid ); |
| if ( lnode ) |
| { |
| supProcess = lnode->GetProcessL ( recv_msg->u.stdin_req.supplier_pid ); |
| if (supProcess) |
| { |
| int fd; |
| fd = Redirector.stdinRemote(supProcess->infile(), |
| recv_msg->u.stdin_req.nid, |
| recv_msg->u.stdin_req.pid); |
| process->FdStdin(fd); |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process " |
| "nid=%d, pid=%d for stdin data request.\n", |
| method_name, |
| recv_msg->u.stdin_req.supplier_nid, |
| recv_msg->u.stdin_req.supplier_pid); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_8, |
| SQ_LOG_ERR, buf); |
| } |
| } |
| } |
| else if (recv_msg->u.stdin_req.reqType == STDIN_FLOW_OFF) |
| { |
| Redirector.stdinOff(process->FdStdin()); |
| } |
| else if (recv_msg->u.stdin_req.reqType == STDIN_FLOW_ON) |
| { |
| Redirector.stdinOn(process->FdStdin()); |
| } |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d for stdin data request.\n", method_name, |
| recv_msg->u.stdin_req.nid, |
| recv_msg->u.stdin_req.pid); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_9, SQ_LOG_INFO, buf); |
| } |
| break; |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Kill: |
| // Queue the kill request for processing by a worker thread. |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal kill request for (%d, %d), abort =%d\n", method_name, __LINE__, recv_msg->u.kill.nid, recv_msg->u.kill.pid, recv_msg->u.kill.persistent_abort); |
| |
| ReqQueue.enqueueKillReq( &recv_msg->u.kill ); |
| break; |
| #endif |
| |
| case InternalType_Process: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal process request\n", method_name, __LINE__); |
| if ( MyNode->IsMyNode(recv_msg->u.process.nid) ) |
| { // Need to create process on this node. |
| // Queue process creation request for handling by worker thread |
| #ifdef NAMESERVER_PROCESS |
| ReqQueue.enqueueNewProcNsReq( &recv_msg->u.process ); |
| #endif |
| #ifndef NAMESERVER_PROCESS |
| ReqQueue.enqueueNewProcReq( &recv_msg->u.process ); |
| #endif |
| } |
| break; |
| |
| case InternalType_ProcessInit: |
| if ( MyNode->IsMyNode(recv_msg->u.processInit.origNid) ) |
| { // New process request originated on this node |
| ReqQueue.enqueueProcInitReq( &recv_msg->u.processInit ); |
| } |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Open: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal open request for (%d, %d), opened (%d, %d)\n", method_name, __LINE__, recv_msg->u.open.nid, recv_msg->u.open.pid, recv_msg->u.open.opened_nid, recv_msg->u.open.opened_pid); |
| |
| ReqQueue.enqueueOpenReq( &recv_msg->u.open ); |
| break; |
| #endif |
| |
| case InternalType_SchedData: |
| #ifdef EXCHANGE_CPU_SCHEDULING_DATA |
| SaveSchedData( recv_msg ); |
| #endif |
| break; |
| |
| case InternalType_Set: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal set request\n", method_name, __LINE__); |
| ReqQueue.enqueueSetReq( &recv_msg->u.set ); |
| break; |
| |
| case InternalType_UniqStr: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal unique string request\n", method_name, __LINE__); |
| ReqQueue.enqueueUniqStrReq( &recv_msg->u.uniqstr ); |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Sync: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Internal sync request for" |
| " Node %s, pnid=%d, SyncType=%d\n", |
| method_name, __LINE__, Node[pnid]->GetName(), pnid, |
| recv_msg->u.sync.type); |
| switch (recv_msg->u.sync.type ) |
| { |
| case SyncType_TmData: |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - TMSYNC(TmData) on Node %s (pnid=%d), (phase=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, MyNode->GetPhase()); |
| if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready ) |
| { |
| MyNode->CheckActivationPhase(); |
| } |
| if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() == Phase_Ready ) |
| { |
| if ( MyNode->GetTmSyncState() == SyncState_Null ) |
| { |
| // Begin a Slave Sync Start |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Slave Sync Start on Node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid); |
| tmSyncPNid_ = pnid; |
| Node[pnid]->SetTmSyncState( recv_msg->u.sync.state ); |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() )); |
| } |
| Monitor->CoordinateTmDataBlock( &recv_msg->u.sync ); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Sync State Collision! Node %s (pnid=%d) TmSyncState=(%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState()) ); |
| if ( MyNode->GetTmSyncState() == SyncState_Continue ) |
| { |
| if ( pnid > tmSyncPNid_ ) |
| // highest node id will continue |
| { |
| // They take priority ... we abort |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Aborting Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[Monitor->tmSyncPNid_]->GetName(), Monitor->tmSyncPNid_); |
| MyNode->SetTmSyncState( SyncState_Null ); |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ) ); |
| Monitor->ReQueue_TmSync (false); |
| // Continue with other node's Slave TmSync Start request |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid); |
| tmSyncPNid_ = pnid; |
| Node[pnid]->SetTmSyncState( recv_msg->u.sync.state ); |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() )); |
| } |
| Monitor->CoordinateTmDataBlock (&recv_msg->u.sync); |
| } |
| } |
| else if ( MyNode->GetTmSyncState() == SyncState_Start ) |
| { |
| // Check if they continue with Master Sync Start |
| if ( pnid > MyPNID ) |
| // highest node id will continue |
| { |
| // They take priority ... we abort |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Aborted Master Sync Start\n", method_name, __LINE__); |
| MyNode->SetTmSyncState( SyncState_Null ); |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ) ); |
| // Continue with other node's Slave TmSync Start request |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid); |
| tmSyncPNid_ = pnid; |
| Node[pnid]->SetTmSyncState( recv_msg->u.sync.state ); |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, Node[pnid]->GetTmSyncState(), SyncStateString( Node[pnid]->GetTmSyncState() )); |
| } |
| Monitor->CoordinateTmDataBlock (&recv_msg->u.sync); |
| } |
| else |
| { |
| // We continue and assume they abort |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Continuing with Master Sync Start\n", method_name, __LINE__); |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Invalid TmSync_State\n", method_name, __LINE__); |
| } |
| } |
| } |
| break; |
| |
| case SyncType_TmSyncState: |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - TMSYNC(TmSyncState) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid); |
| break; |
| |
| default: |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Unknown SyncType from pnid=%d.\n", method_name, pnid); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_10, SQ_LOG_ERR, buf); |
| } |
| } |
| break; |
| #endif |
| |
| default: |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Unknown Internal message received, Physical Node=%d.\n", method_name, pnid); |
| mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_11, SQ_LOG_ERR, buf); |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::HandleMyNodeMsg (struct internal_msg_def *recv_msg, |
| int pnid) |
| { |
| const char method_name[] = "CCluster::HandleMyNodeMsg"; |
| TRACE_ENTRY; |
| |
| #ifndef NAMESERVER_PROCESS |
| CProcess *process; |
| CLNode *lnode; |
| #endif |
| |
| if (trace_settings & TRACE_SYNC_DETAIL) |
| trace_printf("%s@%d - Marking object as replicated, msg type=%d\n", |
| method_name, __LINE__, recv_msg->type); |
| switch (recv_msg->type) |
| { |
| |
| case InternalType_Null: |
| if (trace_settings & TRACE_SYNC_DETAIL) |
| trace_printf("%s@%d - Physical Node pnid=n%d has nothing to " |
| "update. \n", method_name, __LINE__, pnid); |
| break; |
| |
| case InternalType_ActivateSpare: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal activate spare request, spare pnid=%d, down pnid=%d\n" |
| , method_name, __LINE__ |
| , recv_msg->u.activate_spare.spare_pnid |
| , recv_msg->u.activate_spare.down_pnid); |
| break; |
| |
| case InternalType_NameServerAdd: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Internal NameServer add request for node_name=%s\n" |
| , method_name, __LINE__ |
| , recv_msg->u.nameserver_add.node_name ); |
| |
| // Queue the nameserver add request for processing by a worker thread. |
| ReqQueue.enqueueNameServerAddReq( recv_msg->u.nameserver_add.req_nid |
| , recv_msg->u.nameserver_add.req_pid |
| , recv_msg->u.nameserver_add.req_verifier |
| , recv_msg->u.nameserver_add.node_name ); |
| break; |
| |
| case InternalType_NameServerDelete: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Internal NameServer delete request for node=%s\n" |
| , method_name, __LINE__, recv_msg->u.nameserver_delete.node_name); |
| |
| // Queue the nameserver delete request for processing by a worker thread. |
| ReqQueue.enqueueNameServerDeleteReq( recv_msg->u.nameserver_delete.req_nid |
| , recv_msg->u.nameserver_delete.req_pid |
| , recv_msg->u.nameserver_delete.req_verifier |
| , recv_msg->u.nameserver_delete.node_name ); |
| break; |
| |
| case InternalType_NodeAdd: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Internal node add request for node_name=%s, " |
| "first_core=%d, last_core=%d, " |
| "processors=%d, roles=%d\n" |
| , method_name, __LINE__ |
| , recv_msg->u.node_add.node_name |
| , recv_msg->u.node_add.first_core |
| , recv_msg->u.node_add.last_core |
| , recv_msg->u.node_add.processors |
| , recv_msg->u.node_add.roles ); |
| |
| // Queue the node add request for processing by a worker thread. |
| ReqQueue.enqueueNodeAddReq( recv_msg->u.node_add.req_nid |
| , recv_msg->u.node_add.req_pid |
| , recv_msg->u.node_add.req_verifier |
| , recv_msg->u.node_add.node_name |
| , recv_msg->u.node_add.first_core |
| , recv_msg->u.node_add.last_core |
| , recv_msg->u.node_add.processors |
| , recv_msg->u.node_add.roles ); |
| break; |
| |
| case InternalType_Clone: |
| #ifndef NAMESERVER_PROCESS |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal clone request, completed replicating process (%d, %d) %s\n", method_name, __LINE__, recv_msg->u.clone.nid, recv_msg->u.clone.os_pid, (recv_msg->u.clone.backup?" Backup":"")); |
| #else |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal clone request, process (%d, %d)" |
| " %s\n", method_name, __LINE__, |
| recv_msg->u.clone.nid, recv_msg->u.clone.os_pid, |
| (recv_msg->u.clone.backup?" Backup":"")); |
| |
| ReqQueue.enqueueCloneReq( &recv_msg->u.clone ); |
| #endif |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Device: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal device request, completed device processing for ldev %s\n", method_name, __LINE__, recv_msg->u.device.ldev_name); |
| break; |
| #endif |
| |
| case InternalType_Shutdown: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal shutdown request for level=%d\n", method_name, __LINE__, recv_msg->u.shutdown.level); |
| |
| // Queue the shutdown request for processing by a worker thread. |
| ReqQueue.enqueueShutdownReq( recv_msg->u.shutdown.level ); |
| break; |
| |
| case InternalType_NodeDelete: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Internal node delete request for pnid=%d\n" |
| , method_name, __LINE__, recv_msg->u.node_delete.pnid); |
| |
| // Queue the node delete request for processing by a worker thread. |
| ReqQueue.enqueueNodeDeleteReq( recv_msg->u.node_delete.req_nid |
| , recv_msg->u.node_delete.req_pid |
| , recv_msg->u.node_delete.req_verifier |
| , recv_msg->u.node_delete.pnid ); |
| break; |
| |
| case InternalType_Down: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal down node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid); |
| break; |
| |
| case InternalType_NodeName: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal node name request (%s to %s)\n", method_name, __LINE__, recv_msg->u.node_name.current_name, recv_msg->u.node_name.new_name); |
| |
| // Queue the node name request for processing by a worker thread. |
| ReqQueue.enqueueNodeNameReq( recv_msg->u.node_name.req_nid |
| , recv_msg->u.node_name.req_pid |
| , recv_msg->u.node_name.req_verifier |
| , recv_msg->u.node_name.current_name |
| , recv_msg->u.node_name.new_name ); |
| break; |
| |
| case InternalType_SoftNodeDown: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal soft down node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.down.pnid); |
| break; |
| |
| case InternalType_SoftNodeUp: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal soft up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid); |
| break; |
| |
| case InternalType_Up: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal up node request for pnid=%d\n", method_name, __LINE__, recv_msg->u.up.pnid); |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Dump: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal dump request for nid=%d, pid=%d\n", |
| method_name, __LINE__, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid); |
| |
| lnode = Nodes->GetLNode( recv_msg->u.dump.nid ); |
| if ( lnode ) |
| { |
| process = lnode->GetProcessL(recv_msg->u.dump.pid); |
| |
| if (process) |
| { |
| int verifier = recv_msg->u.dump.verifier; |
| if ( (verifier == -1) || (verifier == process->GetVerifier()) ) |
| { |
| process->DumpBegin(recv_msg->u.dump.dumper_nid, |
| recv_msg->u.dump.dumper_pid, |
| recv_msg->u.dump.dumper_verifier, |
| recv_msg->u.dump.core_file); |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d, verifier=%d for dump target.\n", method_name, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid, |
| recv_msg->u.dump.verifier); |
| mon_log_write(MON_CLUSTER_HANDLEMYNODE_1, SQ_LOG_ERR, buf); |
| } |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d for dump target.\n", method_name, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid); |
| mon_log_write(MON_CLUSTER_HANDLEMYNODE_2, SQ_LOG_ERR, buf); |
| } |
| } |
| break; |
| |
| case InternalType_DumpComplete: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal dump-complete request for nid=%d, pid=%d\n", |
| method_name, __LINE__, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid); |
| lnode = Nodes->GetLNode( recv_msg->u.dump.nid ); |
| if ( lnode ) |
| { |
| process = lnode->GetProcessL(recv_msg->u.dump.pid); |
| |
| if (process) |
| { |
| int verifier = recv_msg->u.dump.verifier; |
| if ( (verifier == -1) || (verifier == process->GetVerifier()) ) |
| { |
| process->DumpEnd(recv_msg->u.dump.status, recv_msg->u.dump.core_file); |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d, verifier=%d for dump target.\n", method_name, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid, |
| recv_msg->u.dump.verifier); |
| mon_log_write(MON_CLUSTER_HANDLEMYNODE_3, SQ_LOG_ERR, buf); |
| } |
| } |
| else |
| { |
| // Dump completion handled in CProcess::Exit() |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't find process nid=%d, " |
| "pid=%d for dump complete target.\n", method_name, |
| recv_msg->u.dump.nid, recv_msg->u.dump.pid); |
| mon_log_write(MON_CLUSTER_HANDLEMYNODE_4, SQ_LOG_ERR, buf); |
| } |
| } |
| break; |
| #endif |
| |
| case InternalType_Exit: |
| // Final process exit logic is done in Process_Exit, not here |
| // as in the past. |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal exit request for %s (%d, %d)\n", method_name, __LINE__, recv_msg->u.exit_ns.name, recv_msg->u.exit_ns.nid, recv_msg->u.exit_ns.pid); |
| #ifdef NAMESERVER_PROCESS |
| ReqQueue.enqueueExitNsReq( &recv_msg->u.exit_ns ); |
| #endif |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Event: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal event request\n", method_name, __LINE__); |
| break; |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_IoData: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal IO data request\n", method_name, __LINE__); |
| break; |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_StdinReq: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal STDIN request\n", method_name, __LINE__); |
| break; |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Kill: |
| // Queue the kill request for processing by a worker thread. |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal kill request for (%d, %d), abort =%d\n", method_name, __LINE__, recv_msg->u.kill.nid, recv_msg->u.kill.pid, recv_msg->u.kill.persistent_abort); |
| |
| ReqQueue.enqueueKillReq( &recv_msg->u.kill ); |
| break; |
| #endif |
| |
| case InternalType_Process: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal process request, completed process replication for (%d, %d) %s\n", method_name, __LINE__, recv_msg->u.process.pid, recv_msg->u.process.nid, (recv_msg->u.process.backup?" Backup":"")); |
| break; |
| |
| case InternalType_ProcessInit: |
| // No action needed |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Open: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal open request, completed open replication, " |
| "(%d, %d:%d) opened (%d, %d:%d)\n", |
| method_name, __LINE__, |
| recv_msg->u.open.nid, |
| recv_msg->u.open.pid, |
| recv_msg->u.open.verifier, |
| recv_msg->u.open.opened_nid, |
| recv_msg->u.open.opened_pid, |
| recv_msg->u.open.opened_verifier); |
| break; |
| #endif |
| |
| case InternalType_SchedData: |
| // No action needed |
| break; |
| |
| case InternalType_Set: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal set request, completed replicating key %s::%s\n", method_name, __LINE__, recv_msg->u.set.group, recv_msg->u.set.key); |
| break; |
| |
| case InternalType_UniqStr: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Internal unique string request, completed replicating (%d, %d)\n", method_name, __LINE__, recv_msg->u.uniqstr.nid, recv_msg->u.uniqstr.id); |
| break; |
| |
| #ifndef NAMESERVER_PROCESS |
| case InternalType_Sync: |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Internal sync request for node %s, pnid=%d, SyncType=%d\n" |
| , method_name, __LINE__, Node[pnid]->GetName(), pnid, recv_msg->u.sync.type); |
| switch (recv_msg->u.sync.type ) |
| { |
| case SyncType_TmData: |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - TMSYNC(TmData) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[MyPNID]->GetName(), MyPNID); |
| tmSyncPNid_ = MyPNID; |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Sync communicated, tmSyncPNid_=%d\n", method_name, __LINE__, tmSyncPNid_); |
| if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready ) |
| { |
| MyNode->CheckActivationPhase(); |
| } |
| if ( MyNode->GetTmSyncState() == SyncState_Start && |
| MyNode->GetPhase() == Phase_Ready && |
| MyNode->GetLNodesCount() > 1 ) |
| { |
| // Begin a Slave Sync Start to other |
| // logical nodes in my physical node |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Slave Sync Start on local node %s, pnid=%d\n", method_name, __LINE__, Node[pnid]->GetName(), pnid); |
| Monitor->CoordinateTmDataBlock( &recv_msg->u.sync ); |
| } |
| break; |
| |
| case SyncType_TmSyncState: |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - TMSYNC(TmSyncState) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[MyPNID]->GetName(), MyPNID); |
| break; |
| |
| default: |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Unknown SyncType from node %s, pnid=%d during processing local SyncType.\n", method_name, Node[pnid]->GetName(), pnid); |
| mon_log_write(MON_CLUSTER_HANDLEMYNODE_5, SQ_LOG_ERR, buf); |
| } |
| } |
| break; |
| #endif |
| |
| default: |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Unknown Internal message received during processing local SyncType for pnid=%d.\n", method_name, pnid); |
| mon_log_write(MON_CLUSTER_HANDLEMYNODE_6, SQ_LOG_ERR, buf); |
| } |
| |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| |
| |
| bool CCluster::responsive() |
| { |
| const char method_name[] = "CCluster::responsive"; |
| TRACE_ENTRY; |
| |
| int barrierDiff = barrierCount_ - barrierCountSaved_; |
| |
| // if no difference in barrier count, sync thread is not responsive |
| if ( !barrierDiff && isMonInitComplete() ) |
| { |
| // this proc is called every SYNC_MAX_RESPONSIVE+1 secs |
| cumulativeDelaySec_ += CCluster::SYNC_MAX_RESPONSIVE + 1; |
| |
| monSyncResponsive_ = false; // sync thread is no longer responsive |
| |
| if ( CommType == CommType_InfiniBand ) |
| { |
| // if sync thread is stuck in mpi call, one of the following checks will be true |
| if ( inBarrier_ || inAllGather_ || inCommDup_ ) |
| { |
| mem_log_write(MON_CLUSTER_RESPONSIVE_1, cumulativeDelaySec_, |
| ( ( (inBarrier_ << 1) | inAllGather_ ) << 1 ) | inCommDup_); |
| } |
| else // non-mpi took quite long |
| { |
| mem_log_write(MON_CLUSTER_RESPONSIVE_2, cumulativeDelaySec_); |
| } |
| } |
| else |
| { |
| // if sync thread is stuck in mpi call |
| if ( inBarrier_ ) |
| { |
| mem_log_write(MON_CLUSTER_RESPONSIVE_1, cumulativeDelaySec_, |
| inBarrier_); |
| } |
| else // non-mpi took quite long |
| { |
| mem_log_write(MON_CLUSTER_RESPONSIVE_2, cumulativeDelaySec_); |
| } |
| } |
| } |
| else if (barrierDiff < syncMinPerSec_) |
| { |
| mem_log_write(MON_CLUSTER_RESPONSIVE_3, barrierDiff, syncMinPerSec_); |
| cumulativeDelaySec_ = 0; |
| monSyncResponsive_ = true; // slow but responsive |
| } |
| else |
| { |
| cumulativeDelaySec_ = 0; |
| monSyncResponsive_ = true; // truely responsive |
| } |
| |
| barrierCountSaved_ = barrierCount_; |
| if ( CommType == CommType_InfiniBand ) |
| { |
| allGatherCountSaved_ = allGatherCount_; |
| commDupCountSaved_ = commDupCount_; |
| } |
| |
| TRACE_EXIT; |
| |
| return monSyncResponsive_; |
| } |
| |
| |
| int CCluster::MPIAllgather(void *sendbuf, int sendcount, MPI_Datatype sendtype, |
| void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm Comm) |
| { |
| const char method_name[] = "CCluster::MPIAllGather"; |
| TRACE_ENTRY; |
| |
| inAllGather_ = true; |
| |
| int rc = MPI_Allgather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, Comm); |
| |
| inAllGather_ = false; |
| |
| allGatherCount_++; |
| |
| TRACE_EXIT; |
| return rc; |
| } |
| |
| bool CCluster::ReinitializeConfigCluster( bool nodeAdded, int pnid ) |
| { |
| const char method_name[] = "CCluster::ReinitializeConfigCluster"; |
| TRACE_ENTRY; |
| |
| int rs = true; |
| CNode *pnode; |
| |
| // Update node membership in the cluster |
| |
| if (trace_settings & (TRACE_INIT | TRACE_REQUEST)) |
| trace_printf( "%s@%d - Configured physical nodes count=%d\n" |
| , method_name, __LINE__ |
| , GetConfigPNodesCount() ); |
| |
| if (nodeAdded) |
| { |
| // Add node to monitor's view of the cluster |
| pnode = Nodes->AddNode( pnid ); |
| if ( !pnode ) |
| { |
| rs = false; |
| } |
| } |
| else |
| { |
| // Delete node from monitor's view of the cluster |
| if ( !Nodes->DeleteNode( pnid ) ) |
| { |
| rs = false; |
| } |
| } |
| |
| if ( rs ) |
| { |
| CClusterConfig *clusterConfig = Nodes->GetClusterConfig(); |
| configPNodesCount_ = clusterConfig->GetPNodesCount(); |
| Nodes->UpdateCluster(); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_REQUEST)) |
| trace_printf( "%s@%d - Configured physical nodes count=%d\n" |
| , method_name, __LINE__ |
| , GetConfigPNodesCount() ); |
| |
| TRACE_EXIT; |
| return( rs ); |
| } |
| |
| void CCluster::InitializeConfigCluster( void ) |
| { |
| #ifndef NAMESERVER_PROCESS // nameserver is running in agent mode |
| int rc; |
| #endif |
| |
| const char method_name[] = "CCluster::InitializeConfigCluster"; |
| TRACE_ENTRY; |
| |
| int worldSize = 0; |
| MPI_Comm_size (MPI_COMM_WORLD, &worldSize); |
| #ifdef NAMESERVER_PROCESS |
| if ( !IsRealCluster ) |
| { |
| char *nodes = getenv( "SQ_VIRTUAL_NODES" ); |
| worldSize = atoi(nodes); |
| if ( worldSize <= 0 ) |
| { |
| worldSize = 1; |
| } |
| } |
| #endif |
| CClusterConfig *clusterConfig = Nodes->GetClusterConfig(); |
| configPNodesCount_ = clusterConfig->GetPNodesCount(); |
| int rankToPnid[configPNodesCount_]; |
| |
| #ifdef NAMESERVER_PROCESS |
| currentNodes_ = 1; // non-master Name Servers join set through master Name Server |
| #else |
| if (IAmIntegrating || IsAgentMode) |
| { |
| currentNodes_ = 1; // non-master monitors join cluster through master monitor |
| } |
| else |
| { |
| currentNodes_ = worldSize; |
| } |
| #endif |
| |
| if ( !IsRealCluster ) |
| { |
| // Set virtual cluster size to collective size |
| MPI_Comm_size (MPI_COMM_WORLD, &configPNodesCount_); |
| #ifdef NAMESERVER_PROCESS |
| configPNodesCount_ = worldSize; |
| #endif |
| |
| // For virtual cluster set physical node id equal to rank |
| for (int i=0; i<worldSize; ++i) |
| { |
| rankToPnid[i] = i; |
| |
| // Set bit indicating node is up |
| upNodes_.upNodes[i/MAX_NODE_BITMASK] |= (1ull << (i%MAX_NODE_BITMASK)); |
| } |
| } |
| else |
| { |
| for (int i=0; i<configPNodesCount_; ++i) |
| { |
| rankToPnid[i] = i; |
| rankToPnid[i] = rankToPnid[i]; // make compiler happy |
| } |
| } |
| |
| // Build the monitor's configured view of the cluster |
| if ( IsRealCluster ) |
| { // Map node name to physical node id |
| // (for virtual nodes physical node equals "rank" (previously set)) |
| if (MyPNID == -1) |
| { |
| MyPNID = clusterConfig->GetPNid( Node_name ); |
| if (MyPNID == -1) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s@%d] Can't find node name=%s in cluster configuration\n", |
| method_name, __LINE__, Node_name ); |
| mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_1, SQ_LOG_CRIT, buf); |
| |
| MPI_Abort(MPI_COMM_SELF,99); |
| } |
| } |
| } |
| |
| Nodes->AddNodes( ); |
| MyNode = Nodes->GetNode(MyPNID); |
| Nodes->SetupCluster( &Node, &LNode, &indexToPnid_ ); |
| |
| if ( CommType == CommType_Sockets ) |
| { |
| InitServerSock(); |
| } |
| |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf( "%s@%d (MasterMonitor) IAmIntegrating=%d," |
| " IsAgentMode=%d, IsMaster=%d," |
| " MasterMonitorName=%s, Node_name=%s\n" |
| , method_name, __LINE__ |
| , IAmIntegrating |
| , IsAgentMode, IsMaster, MasterMonitorName, Node_name ); |
| } |
| |
| if (IAmIntegrating || IsAgentMode) |
| { |
| #ifndef NAMESERVER_PROCESS |
| int TmLeaderPNid = -1; |
| if (IsMaster) |
| { |
| tmLeaderNid_ = Nodes->GetFirstNid(); |
| TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid(); |
| } |
| #endif |
| // Monitors processes in AGENT mode in a real cluster initialize all |
| // remote nodes to a down state. The master monitor and the joining |
| // monitors will set the joining node state to up as part of the node |
| // re-integration processing as monitor processes join the cluster |
| // through the master. |
| for (int i=0; i < clusterConfig->GetPNodesCount(); i++) |
| { |
| if (Node[indexToPnid_[i]]) |
| { |
| if (Node[indexToPnid_[i]]->GetPNid() == MyPNID) |
| { // Set bit indicating node is up |
| upNodes_.upNodes[indexToPnid_[i]/MAX_NODE_BITMASK] |= |
| (1ull << (indexToPnid_[i]%MAX_NODE_BITMASK)); |
| } |
| else |
| { // Set node state to down |
| Node[indexToPnid_[i]]->SetState( State_Down ); |
| #ifndef NAMESERVER_PROCESS |
| if (IsMaster) |
| { |
| if (TmLeaderPNid == indexToPnid_[i]) |
| { |
| AssignTmLeader(indexToPnid_[i], false); |
| } |
| } |
| #endif |
| } |
| } |
| } |
| } |
| #ifndef NAMESERVER_PROCESS // nameserver is running in agent mode |
| else |
| { |
| char *nodeNames = 0; |
| if ( IsRealCluster ) |
| { |
| if (trace_settings & TRACE_INIT) |
| trace_printf( "%s@%d Collecting port numbers and node names, " |
| "configPNodesCount_=%d, worldSize=%d, pnid=%d (%s:%s)\n" |
| "MyCommPort=%s\nMySyncPort=%s\n" |
| , method_name, __LINE__ |
| , GetConfigPNodesCount(), worldSize |
| , MyPNID, MyNode->GetName(), MyNode->GetCommPort() |
| , MyCommPort, MySyncPort ); |
| |
| bool nodeStatus[GetConfigPNodesCount()]; |
| for (int i=0; i<GetConfigPNodesCount(); ++i) |
| { |
| nodeStatus[i] = false; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_REQUEST)) |
| trace_printf( "%s@%d - nodeStatus[%d]=%d\n" |
| , method_name, __LINE__, i, nodeStatus[i] ) ; |
| } |
| |
| // Collect comm port info from other monitors |
| char *commPortNums = new char[worldSize * MPI_MAX_PORT_NAME]; |
| rc = MPI_Allgather (MyCommPort, MPI_MAX_PORT_NAME, MPI_CHAR, commPortNums, |
| MPI_MAX_PORT_NAME, MPI_CHAR, MPI_COMM_WORLD); |
| if (rc != MPI_SUCCESS) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n", |
| method_name, __LINE__, ErrorMsg(rc)); |
| mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_2, SQ_LOG_CRIT, buf); |
| |
| MPI_Abort(MPI_COMM_SELF,99); |
| } |
| |
| // Collect sync port info from other monitors |
| char *syncPortNums = new char[worldSize * MPI_MAX_PORT_NAME]; |
| rc = MPI_Allgather (MySyncPort, MPI_MAX_PORT_NAME, MPI_CHAR, syncPortNums, |
| MPI_MAX_PORT_NAME, MPI_CHAR, MPI_COMM_WORLD); |
| if (rc != MPI_SUCCESS) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n", |
| method_name, __LINE__, ErrorMsg(rc)); |
| mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_2, SQ_LOG_CRIT, buf); |
| |
| MPI_Abort(MPI_COMM_SELF,99); |
| } |
| |
| // Exchange Node Names with collective |
| nodeNames = new char[worldSize * MPI_MAX_PROCESSOR_NAME]; |
| rc = MPI_Allgather (Node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, |
| nodeNames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, |
| MPI_COMM_WORLD); |
| if (rc != MPI_SUCCESS) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n", |
| method_name, __LINE__, ErrorMsg(rc)); |
| mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_3, SQ_LOG_CRIT, buf); |
| |
| MPI_Abort(MPI_COMM_SELF,99); |
| } |
| |
| // For each node name received get corresponding CNode object and |
| // store port number in it. |
| char * nodeName; |
| CNode * node; |
| for (int i = 0; i < worldSize; i++) |
| { |
| nodeName = &nodeNames[ i * MPI_MAX_PROCESSOR_NAME ]; |
| node = Nodes->GetNode( nodeName ); |
| if ( node ) |
| { |
| node->SetCommPort( &commPortNums[ i * MPI_MAX_PORT_NAME] ); |
| node->SetSyncPort( &syncPortNums[ i * MPI_MAX_PORT_NAME] ); |
| rankToPnid[i] = node->GetPNid(); |
| nodeStatus[rankToPnid[i]] = true; |
| |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf( "%s@%d rankToPnid[%d]=%d (%s:%s:%s)" |
| "(node=%s,commPort=%s,syncPort=%s)\n" |
| , method_name, __LINE__, i, rankToPnid[i] |
| , node->GetName() |
| , node->GetCommPort() |
| , node->GetSyncPort() |
| , &nodeNames[ i * MPI_MAX_PROCESSOR_NAME] |
| , &commPortNums[ i * MPI_MAX_PORT_NAME] |
| , &syncPortNums[ i * MPI_MAX_PORT_NAME]); |
| } |
| } |
| else |
| { |
| rankToPnid[i] = -1; |
| |
| // Unexpectedly could not map node name to CNode object |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s@%d] Unable to find node " |
| "object for node %s\n", method_name, __LINE__, |
| nodeName ); |
| mon_log_write(MON_CLUSTER_INITCONFIGCLUSTER_4, SQ_LOG_CRIT, buf); |
| } |
| } |
| delete [] commPortNums; |
| delete [] syncPortNums; |
| |
| tmLeaderNid_ = Nodes->GetFirstNid(); |
| int TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid(); |
| |
| // Any nodes not in the initial MPI_COMM_WORLD are down. |
| for (int i=0; i<GetConfigPNodesCount(); ++i) |
| { |
| if ( nodeStatus[indexToPnid_[i]] == false ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_REQUEST)) |
| trace_printf( "%s@%d - nodeStatus[%d]=%d" |
| ", indexToPnid_[%d]=%d\n" |
| , method_name, __LINE__ |
| , i, nodeStatus[i] |
| , i, indexToPnid_[i] ) ; |
| |
| node = Nodes->GetNode(indexToPnid_[i]); |
| if ( node ) node->SetState( State_Down ); |
| // assign new TmLeader if TMLeader node is dead. |
| if (TmLeaderPNid == indexToPnid_[i]) |
| { |
| AssignTmLeader(indexToPnid_[i], false); |
| } |
| } |
| else |
| { // Set bit indicating node is up |
| |
| if (trace_settings & (TRACE_INIT | TRACE_REQUEST)) |
| trace_printf( "%s@%d - nodeStatus[%d]=%d" |
| ", indexToPnid_[%d]=%d\n" |
| , method_name, __LINE__ |
| , i, nodeStatus[i] |
| , i, indexToPnid_[i] ) ; |
| |
| upNodes_.upNodes[indexToPnid_[i]/MAX_NODE_BITMASK] |= |
| (1ull << (indexToPnid_[i]%MAX_NODE_BITMASK)); |
| } |
| } |
| } |
| else |
| { |
| tmLeaderNid_ = 0; |
| } |
| |
| // Initialize communicators for point-to-point communications |
| int myRank; |
| MPI_Comm_rank( MPI_COMM_WORLD, &myRank ); |
| if ( !IsRealCluster ) |
| myRank = MyPNID; |
| |
| InitClusterComm(worldSize, myRank, rankToPnid); |
| if ( CommType == CommType_Sockets ) |
| { |
| InitClusterSocks(worldSize, myRank, nodeNames, rankToPnid); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| for ( int i =0; i < worldSize; i++ ) |
| { |
| trace_printf( "%s@%d socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , rankToPnid[i], socks_[rankToPnid[i]]); |
| } |
| } |
| } |
| |
| if (nodeNames) delete [] nodeNames; |
| } |
| #endif |
| |
| if ( CommType == CommType_Sockets ) |
| { |
| // Allgather() cluster sockets are established as remote |
| // monitor processes join the cluster |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| for ( int i =0; i < clusterConfig->GetPNodesCount() ; i++ ) |
| { |
| trace_printf( "%s@%d %s (%d), state=%s, socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , Node[indexToPnid_[i]]->GetName() |
| , Node[indexToPnid_[i]]->GetPNid() |
| , StateString(Node[indexToPnid_[i]]->GetState()) |
| , indexToPnid_[i], socks_[indexToPnid_[i]]); |
| } |
| } |
| } |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| trace_printf( "%s@%d upNodes set[%d]: %llx\n" |
| , method_name, __LINE__ |
| , i, upNodes_.upNodes[i]); |
| } |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| // Kill the MPICH hydra_pmi_proxy to prevent it from killing all |
| // processes in cluster when mpirun or monitor processes are killed |
| if (!IsAgentMode || (IsAgentMode && IsMPIChild)) |
| { |
| kill( getppid(), SIGKILL ); |
| } |
| #endif |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::InitClusterComm(int worldSize, int myRank, int * rankToPnid) |
| { |
| const char method_name[] = "CCluster::InitClusterComm"; |
| TRACE_ENTRY; |
| |
| // Compute an array of "colors" for use with MPI_Comm_split. |
| int *splitColors; |
| splitColors = new int[worldSize*worldSize*2]; |
| int *splitOtherNode; |
| splitOtherNode = new int[worldSize*worldSize*2]; |
| int splitRows = 0; |
| for ( int i=0; i<(worldSize*worldSize*2); ++i) |
| { |
| splitColors[i] = MPI_UNDEFINED; |
| splitOtherNode[i] = -1; |
| } |
| |
| int color = 1; |
| bool placed; |
| for (int i = 0; i < worldSize; i++) |
| { |
| for (int j = i+1; j < worldSize; j++) |
| { |
| // Find a free slot for rank "i" to rank "j" |
| |
| placed = false; |
| for (int k=0; k<splitRows; ++k) |
| { |
| if ( splitColors[k*worldSize+i] == MPI_UNDEFINED |
| && splitColors[k*worldSize+j] == MPI_UNDEFINED ) |
| { |
| splitColors[k*worldSize+i] = color; |
| splitColors[k*worldSize+j] = color; |
| placed = true; |
| |
| if (myRank == i) |
| splitOtherNode[k] = j; |
| else if (myRank == j) |
| splitOtherNode[k] = i; |
| break; |
| } |
| } |
| if (!placed) |
| { // Need to use a new row |
| splitColors[splitRows*worldSize+i] = color; |
| splitColors[splitRows*worldSize+j] = color; |
| |
| if (myRank == i) |
| splitOtherNode[splitRows] = j; |
| else if (myRank == j) |
| splitOtherNode[splitRows] = i; |
| |
| ++splitRows; |
| } |
| |
| ++color; |
| } |
| } |
| |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf("%s@%d Created %d splitRows for worldSize=%d, myRank=%d\n", |
| method_name, __LINE__, splitRows, worldSize, myRank); |
| string line; |
| char fragment[50]; |
| for (int i=0; i<splitRows; ++i) |
| { |
| sprintf(fragment, "%s@%d splitColors[%d]=", method_name, __LINE__, |
| i); |
| line = fragment; |
| for (int j=0; j<worldSize; ++j) |
| { |
| sprintf(fragment, " %d,", splitColors[i*worldSize+j]); |
| line += fragment; |
| } |
| line += "\n"; |
| trace_printf(line.c_str()); |
| |
| trace_printf("%s@%d splitOtherNode[%d]=%d\n", method_name, |
| __LINE__, i, splitOtherNode[i]); |
| } |
| } |
| |
| // Create one communicator for each other rank in MPI_COMM_WORLD |
| // This permits point-to-point communication with each rank. |
| int myRankInComm; |
| MPI_Comm ncomm; |
| int nid; |
| |
| for (int nSplit=0; nSplit < splitRows; ++nSplit) |
| { |
| color = splitColors[nSplit*worldSize+myRank]; |
| MPI_Comm_split(MPI_COMM_WORLD, color, myRank, &ncomm); |
| if (ncomm == MPI_COMM_NULL) |
| { |
| if (splitColors[nSplit*worldSize+myRank] != MPI_UNDEFINED) |
| { |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf("%s@%d Rank %d: Unexpected MPI_COMM_NULL from " |
| "MPI_Comm_split, nSplit=%d\n", |
| method_name, __LINE__,myRank, nSplit); |
| } |
| } |
| } |
| else |
| { |
| // Set comms_ (communicators) array element for the |
| // physical node. |
| nid = rankToPnid[splitOtherNode[nSplit]]; |
| comms_[nid] = ncomm; |
| |
| MPI_Comm_rank(ncomm, &myRankInComm); |
| otherMonRank_[nid] = (myRankInComm == 0)? 1: 0; |
| |
| if (trace_settings & TRACE_INIT) |
| { |
| trace_printf("%s@%d Rank %d: MPI_Comm_split %d, color=%d, " |
| "comms_[%d] is orig rank #%d, " |
| "otherMonRank_=%d\n", |
| method_name, __LINE__, |
| myRank, nSplit, color, |
| nid, splitOtherNode[nSplit], |
| otherMonRank_[nid]); |
| } |
| } |
| } |
| |
| delete [] splitColors; |
| delete [] splitOtherNode; |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::HandleReintegrateError( int rc, int err, |
| int pnid, nodeId_t *nodeInfo, |
| bool abortIn ) |
| { |
| const char method_name[] = "CCluster::HandleReintegrateError"; |
| TRACE_ENTRY; |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| |
| switch ( err ) |
| { |
| case Reintegrate_Err1: |
| snprintf(buf, sizeof(buf), "[%s], can't to connect to creator monitor" |
| " port: %s - Error: %s.\n", |
| method_name, IntegratingMonitorPort, ErrorMsg(rc)); |
| break; |
| |
| case Reintegrate_Err2: |
| snprintf(buf, sizeof(buf), "[%s], can't merge intercomm to existing " |
| "MPI collective - Error: %s.\n", |
| method_name, ErrorMsg(rc)); |
| |
| break; |
| |
| case Reintegrate_Err3: |
| snprintf(buf, sizeof(buf), "[%s], unable to obtain cluster info " |
| "from creator monitor: %s.\n", method_name, ErrorMsg(rc)); |
| break; |
| |
| case Reintegrate_Err4: |
| snprintf(buf, sizeof(buf), "[%s], Failed to send name/port " |
| "to node %d (%s): %s.\n", method_name, pnid, |
| nodeInfo->nodeName, ErrorMsg(rc)); |
| break; |
| |
| case Reintegrate_Err5: |
| snprintf(buf, sizeof(buf), "[%s], can't to connect to " |
| " node %d monitor, commPort=%s, syncPort=%s: %s.\n", |
| method_name, pnid, nodeInfo->commPort, |
| nodeInfo->syncPort, ErrorMsg(rc)); |
| break; |
| |
| case Reintegrate_Err6: |
| snprintf(buf, sizeof(buf), "[%s], can't merge intercomm " |
| "for node %d: %s.\n", method_name, pnid, |
| ErrorMsg(rc)); |
| break; |
| |
| case Reintegrate_Err7: |
| snprintf(buf, sizeof(buf), "[%s], can't disconnect " |
| "intercomm for node %d: %s.\n", method_name, pnid, |
| ErrorMsg(rc)); |
| break; |
| |
| case Reintegrate_Err8: |
| snprintf(buf, sizeof(buf), "[%s], Failed to send status to creator " |
| "monitor: %s\n", method_name, ErrorMsg(rc)); |
| break; |
| |
| case Reintegrate_Err9: |
| snprintf(buf, sizeof(buf), "[%s], Failed to send name/port " |
| "to creator monitor: %s.\n", method_name, ErrorMsg(rc)); |
| break; |
| |
| case Reintegrate_Err10: |
| snprintf(buf, sizeof(buf), "[%s], Monitor initialization failed (could" |
| " not write to port file). Aborting.\n", method_name); |
| break; |
| |
| case Reintegrate_Err11: |
| snprintf(buf, sizeof(buf), "[%s], Monitor initialization failed (could" |
| " not open port file). Aborting.\n", method_name); |
| break; |
| |
| case Reintegrate_Err12: |
| snprintf(buf, sizeof(buf), "[%s], Monitor initialization failed (could" |
| " not initialize local io). Aborting.\n", method_name); |
| break; |
| |
| case Reintegrate_Err13: |
| snprintf(buf, sizeof(buf), "[%s], Monitor initialization failed (could" |
| " not initialize devices). Aborting.\n", method_name); |
| break; |
| |
| case Reintegrate_Err14: |
| snprintf(buf, sizeof(buf), "[%s] Aborting.\n", method_name); |
| break; |
| |
| case Reintegrate_Err15: |
| snprintf(buf, sizeof(buf), "[%s], no connect acknowledgement " |
| "for node %d: %s.\n", method_name, pnid, |
| ErrorMsg(rc)); |
| break; |
| |
| default: |
| snprintf(buf, sizeof(buf), "[%s], Reintegration error: %s\n", |
| method_name, ErrorMsg(rc)); |
| } |
| |
| mon_log_write(MON_CLUSTER_REINTEGRATE_1, SQ_LOG_ERR, buf); |
| |
| if ( abortIn ) |
| MPI_Abort(MPI_COMM_SELF,99); |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::SendReIntegrateStatus( STATE nodeState, int initErr ) |
| { |
| const char method_name[] = "CCluster::SendReIntegrateStatus"; |
| int rc; |
| nodeStatus_t nodeStatus; |
| nodeStatus.state = nodeState; |
| nodeStatus.status = initErr; |
| |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| rc = Monitor->SendMPI( (char *) &nodeStatus |
| , sizeof(nodeStatus_t) |
| , 0 |
| , MON_XCHNG_DATA |
| , joinComm_ ); |
| if ( rc ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err8, -1, NULL, true ); |
| } |
| break; |
| case CommType_Sockets: |
| rc = Monitor->SendSock( (char *) &nodeStatus |
| , sizeof(nodeStatus_t) |
| , joinSock_ |
| , method_name ); |
| if ( rc ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err8, -1, NULL, true ); |
| } |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| |
| if ( nodeState != State_Up ) |
| { // Initialization error, abort. |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_9, MyPNID, initErr); |
| HandleReintegrateError( rc, initErr, -1, NULL, true ); |
| } |
| } |
| |
| bool CCluster::PingSockPeer(CNode *node) |
| { |
| const char method_name[] = "CCluster::PingSockPeer"; |
| TRACE_ENTRY; |
| |
| static int sv_connect_wait_timeout = -2; |
| static int sv_connect_retry_count = 1; |
| if ( sv_connect_wait_timeout == -2 ) |
| { |
| // Use the EPOLL timeout and retry values |
| char *lv_connect_wait_timeout_env = getenv( "SQ_MON_EPOLL_WAIT_TIMEOUT" ); |
| if ( lv_connect_wait_timeout_env ) |
| { |
| // Timeout in seconds |
| sv_connect_wait_timeout = atoi( lv_connect_wait_timeout_env ); |
| char *lv_connect_retry_count_env = getenv( "SQ_MON_EPOLL_RETRY_COUNT" ); |
| if ( lv_connect_retry_count_env ) |
| { |
| sv_connect_retry_count = atoi( lv_connect_retry_count_env ); |
| } |
| if ( sv_connect_retry_count > 180 ) |
| { |
| sv_connect_retry_count = 180; |
| } |
| } |
| else |
| { |
| // default to 64 seconds |
| sv_connect_wait_timeout = 16; |
| sv_connect_retry_count = 4; |
| } |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] Ping connect timeout wait_timeout=1 second, retry_count=%d\n" |
| , method_name |
| , __LINE__ |
| , (sv_connect_retry_count * sv_connect_wait_timeout) ); |
| |
| mon_log_write( MON_PINGSOCKPEER_3, SQ_LOG_INFO, buf ); |
| } |
| |
| bool rs = true; |
| int rc; |
| int pingSock = -1; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Pinging remote monitor %s, pnid=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() ); |
| } |
| |
| // Attempt to connect with remote monitor in one seconds increments |
| // to recover as quickly as possible or give up trying |
| for (int i = 0; i < (sv_connect_retry_count*sv_connect_wait_timeout); i++ ) |
| { |
| // Disable internal retries |
| pingSock = Monitor->Connect( node->GetCommPort(), false ); |
| if ( pingSock < 0 ) |
| { |
| if (node->GetState() != State_Up) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Node %s (%d) is not up, " |
| "socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , node->GetPNid(), socks_[node->GetPNid()] ); |
| } |
| break; |
| } |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] Retrying connect to remote monitor %s, pnid=%d, retry=%d\n" |
| , method_name |
| , __LINE__ |
| , node->GetName(), node->GetPNid(), i ); |
| mon_log_write( MON_PINGSOCKPEER_4, SQ_LOG_INFO, buf ); |
| sleep( 1 ); |
| } |
| else |
| { |
| break; |
| } |
| } |
| if ( pingSock < 0 ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Can't connect to remote monitor %s, pnid=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() ); |
| } |
| return(false); |
| } |
| |
| nodeId_t nodeInfo; |
| |
| nodeInfo.pnid = MyPNID; |
| strcpy(nodeInfo.nodeName, MyNode->GetName()); |
| strcpy(nodeInfo.commPort, MyNode->GetCommPort()); |
| strcpy(nodeInfo.syncPort, MyNode->GetSyncPort()); |
| nodeInfo.ping = true; |
| nodeInfo.creatorPNid = -1; |
| nodeInfo.creator = false; |
| nodeInfo.creatorShellPid = -1; |
| nodeInfo.creatorShellVerifier = -1; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "Sending my nodeInfo.pnid=%d\n" |
| " nodeInfo.nodeName=%s\n" |
| " nodeInfo.commPort=%s\n" |
| " nodeInfo.syncPort=%s\n" |
| " nodeInfo.creatorPNid=%d\n" |
| " nodeInfo.creator=%d\n" |
| " nodeInfo.creatorShellPid=%d\n" |
| " nodeInfo.creatorShellVerifier=%d\n" |
| " nodeInfo.ping=%d\n" |
| , nodeInfo.pnid |
| , nodeInfo.nodeName |
| , nodeInfo.commPort |
| , nodeInfo.syncPort |
| , nodeInfo.creatorPNid |
| , nodeInfo.creator |
| , nodeInfo.creatorShellPid |
| , nodeInfo.creatorShellVerifier |
| , nodeInfo.ping ); |
| } |
| |
| rc = Monitor->SendSock( (char *) &nodeInfo |
| , sizeof(nodeId_t) |
| , pingSock |
| , method_name ); |
| |
| if ( rc ) |
| { |
| rs = false; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Cannot send ping node info to node %s: (%s)\n" |
| , method_name, node->GetName(), ErrorMsg(rc)); |
| mon_log_write(MON_PINGSOCKPEER_1, SQ_LOG_ERR, buf); |
| } |
| else |
| { |
| // Get info about connecting monitor |
| rc = Monitor->ReceiveSock( (char *) &nodeInfo |
| , sizeof(nodeId_t) |
| , pingSock |
| , method_name ); |
| if ( rc ) |
| { // Handle error |
| rs = false; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Cannot receive ping node info from node %s: (%s)\n" |
| , method_name, node->GetName(), ErrorMsg(rc)); |
| mon_log_write(MON_PINGSOCKPEER_2, SQ_LOG_ERR, buf); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "Received from nodeInfo.pnid=%d\n" |
| " nodeInfo.nodeName=%s\n" |
| " nodeInfo.commPort=%s\n" |
| " nodeInfo.syncPort=%s\n" |
| " nodeInfo.ping=%d\n" |
| , nodeInfo.pnid |
| , nodeInfo.nodeName |
| , nodeInfo.commPort |
| , nodeInfo.syncPort |
| , nodeInfo.ping ); |
| } |
| } |
| } |
| |
| close( pingSock ); |
| |
| TRACE_EXIT; |
| return( rs ); |
| } |
| |
| void CCluster::ReIntegrate( int initProblem ) |
| { |
| const char method_name[] = "CCluster::ReIntegrate"; |
| TRACE_ENTRY; |
| |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| ReIntegrateMPI( initProblem ); |
| break; |
| case CommType_Sockets: |
| ReIntegrateSock( initProblem ); |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::ReIntegrateMPI( int initProblem ) |
| { |
| const char method_name[] = "CCluster::ReIntegrateMPI"; |
| TRACE_ENTRY; |
| |
| int rc; |
| bool haveCreatorComm = false; |
| MPI_Comm interComm; |
| MPI_Comm intraComm = MPI_COMM_NULL; |
| MPI_Comm intraCommCreatorMon = MPI_COMM_NULL; |
| |
| nodeId_t myNodeInfo; |
| strcpy(myNodeInfo.nodeName, MyNode->GetName()); |
| strcpy(myNodeInfo.commPort, MyNode->GetCommPort()); |
| // Set bit indicating my node is up |
| upNodes_.upNodes[MyPNID/MAX_NODE_BITMASK] |= (1ull << (MyPNID%MAX_NODE_BITMASK)); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - Connect to creator monitor (port %s)\n", |
| method_name, __LINE__, IntegratingMonitorPort); |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_1, MyPNID); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| trace_printf( "%s@%d Integrating node %s (pnid=%d) " |
| "sees set[%d]: %llx\n" |
| , method_name, __LINE__ |
| , MyNode->GetName(), MyPNID |
| , i, upNodes_.upNodes[i] ); |
| } |
| } |
| |
| TEST_POINT( TP010_NODE_UP ); |
| // Connect with my creator monitor |
| rc = MPI_Comm_connect( IntegratingMonitorPort, |
| MPI_INFO_NULL, 0, MPI_COMM_SELF, &joinComm_ ); |
| if ( rc ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err1, -1, NULL, true ); |
| } |
| |
| MPI_Comm_set_errhandler( joinComm_, MPI_ERRORS_RETURN ); |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_4, MyPNID); |
| |
| TEST_POINT( TP011_NODE_UP ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf("%s@%d Connected to creator monitor, sending id\n", |
| method_name, __LINE__); |
| } |
| |
| // Send this node's name and port number so creator monitor |
| // knows who we are, and set flag to let creator monitor it is the CREATOR. |
| myNodeInfo.creator = true; |
| myNodeInfo.creatorShellPid = CreatorShellPid; |
| myNodeInfo.creatorShellVerifier = CreatorShellVerifier; |
| if ((rc = Monitor->SendMPI((char *) &myNodeInfo, sizeof(nodeId_t), 0, |
| MON_XCHNG_DATA, joinComm_))) |
| HandleReintegrateError( rc, Reintegrate_Err9, -1, NULL, |
| true ); |
| |
| TEST_POINT( TP012_NODE_UP ); |
| |
| // Merge the inter-communicators obtained from the connect/accept |
| // between this new monitor and the creator monitor. |
| if ((rc = MPI_Intercomm_merge( joinComm_, 1, &intraCommCreatorMon ))) |
| HandleReintegrateError( rc, Reintegrate_Err2, -1, NULL, true ); |
| |
| MPI_Comm_set_errhandler( intraCommCreatorMon, MPI_ERRORS_RETURN ); |
| |
| nodeId_t *nodeInfo = new nodeId_t[GetConfigPNodesCount()]; |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_3, MyPNID); |
| |
| // Obtain node names & port numbers of existing monitors from |
| // the creator monitor. |
| if ((rc = Monitor->ReceiveMPI((char *)nodeInfo, sizeof(nodeId_t)*GetConfigPNodesCount(), |
| MPI_ANY_SOURCE, MON_XCHNG_DATA, joinComm_))) |
| HandleReintegrateError( rc, Reintegrate_Err3, -1, NULL, true ); |
| |
| if ( initProblem ) |
| { |
| // The monitor encountered an initialization error. Inform |
| // the creator monitor that the node is down. Then abort. |
| SendReIntegrateStatus( State_Down, initProblem ); |
| } |
| |
| // Connect to each of the other existing monitors and let them know |
| // we are the NEW monitor and reset the creator flag so they know they are |
| // not the creator monitor. |
| myNodeInfo.creator = false; |
| myNodeInfo.creatorShellPid = -1; |
| myNodeInfo.creatorShellVerifier = -1; |
| for (int i = 0; i < GetConfigPNodesCount(); i++) |
| { |
| if (strcmp(nodeInfo[i].commPort, IntegratingMonitorPort) == 0) |
| { // Already connected to creator monitor |
| comms_[i] = intraCommCreatorMon; |
| otherMonRank_[i] = 0; |
| ++currentNodes_; |
| |
| // Set bit indicating node is up |
| upNodes_.upNodes[i/MAX_NODE_BITMASK] |= (1ull << (i%MAX_NODE_BITMASK)); |
| |
| Node[i]->SetCommPort( IntegratingMonitorPort ); |
| Node[i]->SetState( State_Up ); |
| haveCreatorComm = true; |
| } |
| else if (nodeInfo[i].nodeName[0] != 0 |
| && nodeInfo[i].commPort[0] != 0) |
| { |
| if ( haveCreatorComm && i >= GetConfigPNodesCount()/2) |
| // Reintegration failure after connecting to half |
| // of existing monitors. |
| TEST_POINT( TP016_NODE_UP ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf("%s@%d Attempting connection to node %d (%s), " |
| "port %s\n", method_name, __LINE__, i, |
| nodeInfo[i].nodeName, nodeInfo[i].commPort); |
| } |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_5, MyPNID, i); |
| |
| TEST_POINT( TP013_NODE_UP ); |
| |
| // Connect to existing monitor |
| if ((rc = MPI_Comm_connect( nodeInfo[i].commPort, |
| MPI_INFO_NULL, 0, MPI_COMM_SELF, |
| &interComm ))) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i], |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| MPI_Comm_set_errhandler( interComm, MPI_ERRORS_RETURN ); |
| |
| TEST_POINT( TP014_NODE_UP ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf("%s@%d Connected to node %d (%s), sending id\n", |
| method_name, __LINE__,i,nodeInfo[i].nodeName); |
| } |
| |
| // Send this nodes name and port number so other monitor |
| // knows who we are. |
| if ((rc = Monitor->SendMPI((char *) &myNodeInfo, sizeof(nodeId_t), 0, |
| MON_XCHNG_DATA, interComm))) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i], |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| if ((rc = MPI_Intercomm_merge(interComm, 1, &intraComm))) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err6, i, NULL, false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| // Get acknowledgement that other monitor is ready to |
| // integrate this node. This is an interlock to avoid a |
| // race condition where the creator monitor could signal |
| // the monitors in the cluster to integrate the new node |
| // before one or more was ready to do the integration. |
| int readyFlag; |
| if ((rc = Monitor->ReceiveMPI((char *) &readyFlag, sizeof(readyFlag), |
| MPI_ANY_SOURCE, MON_XCHNG_DATA, |
| interComm))) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err15, i, NULL, |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Received ready-flag from node %d (%s)\n", |
| method_name, __LINE__, i, |
| nodeInfo[i].nodeName); |
| } |
| |
| if ((rc = MPI_Comm_disconnect(&interComm))) |
| HandleReintegrateError( rc, Reintegrate_Err7, i, NULL, false ); |
| |
| MPI_Comm_set_errhandler(intraComm, MPI_ERRORS_RETURN); |
| |
| comms_[i] = intraComm; |
| otherMonRank_[i] = 0; |
| ++currentNodes_; |
| Node[i]->SetSyncPort( nodeInfo[i].syncPort ); |
| Node[i]->SetState( State_Up ); |
| |
| // Set bit indicating node is up |
| upNodes_.upNodes[i/MAX_NODE_BITMASK] |= (1ull << (i%MAX_NODE_BITMASK)); |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_6, MyPNID, i); |
| } |
| else if ( i != MyPNID) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf("%s@%d Connection to node %d not attempted, " |
| "no port information. nodeInfo[%d].port=%s, " |
| "IntegratingMonitorPort=%s\n", method_name, |
| __LINE__, i, i, nodeInfo[i].commPort, |
| IntegratingMonitorPort); |
| } |
| } |
| } |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| trace_printf( "%s@%d Integrating node %s (pnid=%d) " |
| "sees set[%d]: %llx\n" |
| , method_name, __LINE__ |
| , MyNode->GetName(), MyPNID |
| , i, upNodes_.upNodes[i] ); |
| } |
| } |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_7, MyPNID); |
| |
| TEST_POINT( TP015_NODE_UP ); |
| |
| // Inform creator monitor that connections are complete and |
| // this monitor is ready to participate in "allgather" |
| // communications with the other monitors. |
| SendReIntegrateStatus( State_Up, 0 ); |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_8, MyPNID); |
| |
| MyNode->SetState( State_Merged ); |
| |
| delete[] nodeInfo; |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::ReIntegrateSock( int initProblem ) |
| { |
| const char method_name[] = "CCluster::ReIntegrateSock"; |
| TRACE_ENTRY; |
| |
| bool haveCreatorSocket = false; |
| int rc; |
| int existingCommFd; |
| int existingSyncFd; |
| char commPort[MPI_MAX_PORT_NAME]; |
| char syncPort[MPI_MAX_PORT_NAME]; |
| char *pch1; |
| char *pch2; |
| |
| // Set bit indicating my node is up |
| upNodes_.upNodes[MyPNID/MAX_NODE_BITMASK] |= (1ull << (MyPNID%MAX_NODE_BITMASK)); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - Connect to creator monitor (port %s)\n", |
| method_name, __LINE__, IntegratingMonitorPort); |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_1, MyPNID); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| trace_printf( "%s@%d Integrating node %s (pnid=%d) " |
| "sees set[%d]: %llx\n" |
| , method_name, __LINE__ |
| , MyNode->GetName(), MyPNID |
| , i, upNodes_.upNodes[i] ); |
| } |
| } |
| |
| TEST_POINT( TP010_NODE_UP ); |
| |
| // Connect with my creator monitor |
| bool lv_done = false; |
| bool lv_did_not_connect_in_first_attempt = false; |
| while ( ! lv_done ) |
| { |
| joinSock_ = Monitor->Connect( IntegratingMonitorPort ); |
| if ( joinSock_ < 0 ) |
| { |
| if ( IsAgentMode ) |
| { |
| lv_did_not_connect_in_first_attempt = true; |
| sleep( 15 ); |
| } |
| else |
| { |
| HandleReintegrateError( joinSock_, Reintegrate_Err1, -1, NULL, true ); |
| } |
| } |
| else |
| { |
| if ( lv_did_not_connect_in_first_attempt ) |
| { |
| sleep( 10 ); |
| } |
| lv_done = true; |
| } |
| } |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_4, MyPNID); |
| |
| TEST_POINT( TP011_NODE_UP ); |
| |
| // Send this node's name and port number so creator monitor |
| // knows who we are, and set flag to let creator monitor it is the CREATOR. |
| nodeId_t myNodeInfo; |
| strcpy(myNodeInfo.nodeName, MyNode->GetName()); |
| strcpy(myNodeInfo.commPort, MyNode->GetCommPort()); |
| strcpy(myNodeInfo.syncPort, MyNode->GetSyncPort()); |
| myNodeInfo.pnid = MyNode->GetPNid(); |
| myNodeInfo.creatorPNid = -1; |
| myNodeInfo.creator = true; |
| myNodeInfo.creatorShellPid = CreatorShellPid; |
| myNodeInfo.creatorShellVerifier = CreatorShellVerifier; |
| myNodeInfo.ping = false; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Connected to creator monitor, sending my info:\n" |
| " myNodeInfo.pnid=%d (%s)\n" |
| " myNodeInfo.commPort=%s\n" |
| " myNodeInfo.syncPort=%s\n" |
| " myNodeInfo.creator=%d\n" |
| " myNodeInfo.creatorShellPid=%d:%d\n" |
| " myNodeInfo.ping=%d\n" |
| , method_name, __LINE__ |
| , myNodeInfo.pnid |
| , myNodeInfo.nodeName |
| , myNodeInfo.commPort |
| , myNodeInfo.syncPort |
| , myNodeInfo.creator |
| , myNodeInfo.creatorShellPid |
| , myNodeInfo.creatorShellVerifier |
| , myNodeInfo.ping ); |
| } |
| |
| rc = Monitor->SendSock( (char *) &myNodeInfo |
| , sizeof(nodeId_t) |
| , joinSock_ |
| , method_name ); |
| if ( rc ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err9, -1, NULL, true ); |
| } |
| |
| TEST_POINT( TP012_NODE_UP ); |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_3, MyPNID); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf("%s@%d Getting all node info from creator monitor\n", |
| method_name, __LINE__); |
| } |
| |
| // Obtain node names & port numbers of existing monitors from |
| // the creator monitor. |
| int pnodeCount = Nodes->GetPNodesCount(); |
| nodeId_t *nodeInfo; |
| size_t nodeInfoSize = (sizeof(nodeId_t) * pnodeCount); |
| nodeInfo = (nodeId_t *) new char[nodeInfoSize]; |
| rc = Monitor->ReceiveSock( (char *)nodeInfo |
| , nodeInfoSize |
| , joinSock_ |
| , method_name ); |
| if ( rc ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err3, -1, NULL, true ); |
| } |
| |
| if ( initProblem ) |
| { |
| // The monitor encountered an initialization error. Inform |
| // the creator monitor that the node is down. Then abort. |
| SendReIntegrateStatus( State_Down, initProblem ); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Received port info from creator monitor\n" |
| , method_name, __LINE__); |
| for (int i=0; i<pnodeCount; i++) |
| { |
| trace_printf( "Port info for pnid=%d\n" |
| " nodeInfo[%d].nodeName=%s\n" |
| " nodeInfo[%d].commPort=%s\n" |
| " nodeInfo[%d].syncPort=%s\n" |
| " nodeInfo[%d].creatorPNid=%d\n" |
| , nodeInfo[i].pnid |
| , i, nodeInfo[i].nodeName |
| , i, nodeInfo[i].commPort |
| , i, nodeInfo[i].syncPort |
| , i, nodeInfo[i].creatorPNid ); |
| } |
| } |
| // Connect to each of the other existing monitors and let them know |
| // we are the NEW monitor and reset the creator flag so they know they are |
| // not the creator monitor. |
| myNodeInfo.creator = false; |
| myNodeInfo.creatorShellPid = -1; |
| myNodeInfo.creatorShellVerifier = -1; |
| myNodeInfo.ping = false; |
| for (int i=0; i<pnodeCount; i++) |
| { |
| if ( nodeInfo[i].creatorPNid != -1 && |
| nodeInfo[i].creatorPNid == nodeInfo[i].pnid ) |
| { |
| // Get acknowledgement that creator monitor is ready to |
| // integrate this node. |
| int creatorpnid = -1; |
| rc = Monitor->ReceiveSock( (char *) &creatorpnid |
| , sizeof(creatorpnid) |
| , joinSock_ |
| , method_name ); |
| if ( rc || creatorpnid != nodeInfo[i].creatorPNid ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err15, i, NULL, |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Received ready indication from creator " |
| "node %d nodeInfo[%d].nodeName=%s\n" |
| , method_name, __LINE__ |
| , creatorpnid, i , nodeInfo[i].nodeName); |
| } |
| |
| otherMonRank_[nodeInfo[i].pnid] = 0; |
| ++currentNodes_; |
| |
| // Store port numbers for the node |
| strncpy(commPort, nodeInfo[i].commPort, MPI_MAX_PORT_NAME); |
| strncpy(syncPort, nodeInfo[i].syncPort, MPI_MAX_PORT_NAME); |
| |
| Node[nodeInfo[i].pnid]->SetCommPort( commPort ); |
| pch1 = strtok (commPort,":"); |
| pch1 = strtok (NULL,":"); |
| Node[nodeInfo[i].pnid]->SetCommSocketPort( atoi(pch1) ); |
| Node[nodeInfo[i].pnid]->SetSyncPort( syncPort ); |
| pch2 = strtok (syncPort,":"); |
| pch2 = strtok (NULL,":"); |
| Node[nodeInfo[i].pnid]->SetSyncSocketPort( atoi(pch2) ); |
| sockPorts_[nodeInfo[i].pnid] = Node[nodeInfo[i].pnid]->GetSyncSocketPort(); |
| |
| Node[nodeInfo[i].pnid]->SetState( State_Up ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting node %d (%s), commPort=%s(%d), syncPort=%s(%d)\n" |
| , method_name, __LINE__ |
| , Node[nodeInfo[i].pnid]->GetPNid() |
| , Node[nodeInfo[i].pnid]->GetName() |
| , pch1, atoi(pch1) |
| , pch2, atoi(pch2) ); |
| } |
| |
| // Tell creator we are ready to accept its connection |
| int mypnid = MyPNID; |
| rc = Monitor->SendSock( (char *) &mypnid |
| , sizeof(mypnid) |
| , joinSock_ |
| , method_name ); |
| if ( rc ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i], |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| // Connect to creator monitor |
| existingSyncFd = AcceptSyncSock(); |
| if ( existingSyncFd < 0 ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i], |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| socks_[nodeInfo[i].pnid] = existingSyncFd; // ReIntegrateSock |
| // Set bit indicating node is up |
| upNodes_.upNodes[nodeInfo[i].pnid/MAX_NODE_BITMASK] |= |
| (1ull << (nodeInfo[i].pnid%MAX_NODE_BITMASK)); |
| |
| if (trace_settings & (TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf( "%s@%d Connected to creator node %d (%s)\n" |
| , method_name, __LINE__ |
| , nodeInfo[i].creatorPNid |
| , nodeInfo[i].nodeName ); |
| trace_printf( "%s@%d socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , nodeInfo[i].pnid, socks_[nodeInfo[i].pnid]); |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| trace_printf( "%s@%d Integrating node %s (pnid=%d) " |
| "sees set[%d]: %llx\n" |
| , method_name, __LINE__ |
| , MyNode->GetName(), MyPNID |
| , i, upNodes_.upNodes[i] ); |
| } |
| } |
| |
| haveCreatorSocket = true; |
| } |
| else if ( nodeInfo[i].nodeName[0] != 0 && nodeInfo[i].commPort[0] != 0 ) |
| { |
| if ( haveCreatorSocket && i >= pnodeCount/2) |
| // Reintegration failure after connecting to half |
| // of existing monitors. |
| TEST_POINT( TP016_NODE_UP ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf("%s@%d Attempting connection to node %d (%s), " |
| "port %s\n", method_name, __LINE__, nodeInfo[i].pnid, |
| nodeInfo[i].nodeName, nodeInfo[i].commPort); |
| } |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_5, MyPNID, i); |
| |
| TEST_POINT( TP013_NODE_UP ); |
| |
| // Connect to existing monitor |
| existingCommFd = Monitor->Connect( nodeInfo[i].commPort ); |
| if ( existingCommFd < 0 ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i], |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| TEST_POINT( TP014_NODE_UP ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf("%s@%d Connected to node %d (%s), sending my node name\n", |
| method_name, __LINE__,i,nodeInfo[i].nodeName); |
| } |
| |
| // Send this nodes name and port number so other monitor |
| // knows who we are. |
| rc = Monitor->SendSock( (char *) &myNodeInfo |
| , sizeof(nodeId_t) |
| , existingCommFd |
| , method_name ); |
| if ( rc ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err4, i, &nodeInfo[i], |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| // Get acknowledgement that other monitor is ready to |
| // integrate this node. This is an interlock to avoid a |
| // race condition where the creator monitor could signal |
| // the monitors in the cluster to integrate the new node |
| // before one or more was ready to do the integration. |
| int remotepnid = -1; |
| rc = Monitor->ReceiveSock( (char *) &remotepnid |
| , sizeof(remotepnid) |
| , existingCommFd |
| , method_name ); |
| if ( rc || remotepnid != nodeInfo[i].pnid ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err15, i, NULL, |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Received ready indication from " |
| "node %d nodeInfo[%d].nodeName=%s\n" |
| , method_name, __LINE__ |
| , remotepnid, i , nodeInfo[i].nodeName); |
| } |
| |
| otherMonRank_[nodeInfo[i].pnid] = 0; |
| ++currentNodes_; |
| |
| // Store port numbers for the node |
| strncpy(commPort, nodeInfo[i].commPort, MPI_MAX_PORT_NAME); |
| strncpy(syncPort, nodeInfo[i].syncPort, MPI_MAX_PORT_NAME); |
| |
| Node[nodeInfo[i].pnid]->SetCommPort( commPort ); |
| pch1 = strtok (commPort,":"); |
| pch1 = strtok (NULL,":"); |
| Node[nodeInfo[i].pnid]->SetCommSocketPort( atoi(pch1) ); |
| |
| Node[nodeInfo[i].pnid]->SetSyncPort( syncPort ); |
| pch2 = strtok (syncPort,":"); |
| pch2 = strtok (NULL,":"); |
| Node[nodeInfo[i].pnid]->SetSyncSocketPort( atoi(pch2) ); |
| sockPorts_[nodeInfo[i].pnid] = Node[nodeInfo[i].pnid]->GetSyncSocketPort(); |
| |
| Node[nodeInfo[i].pnid]->SetState( State_Up ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting node %d (%s), commPort=%s(%d), syncPort=%s(%d)\n" |
| , method_name, __LINE__ |
| , Node[nodeInfo[i].pnid]->GetPNid() |
| , Node[nodeInfo[i].pnid]->GetName() |
| , pch1, atoi(pch1) |
| , pch2, atoi(pch2) ); |
| } |
| |
| // Connect to existing monitor |
| existingSyncFd = AcceptSyncSock(); |
| if ( existingSyncFd < 0 ) |
| { |
| HandleReintegrateError( rc, Reintegrate_Err5, i, &nodeInfo[i], |
| false ); |
| SendReIntegrateStatus( State_Down, Reintegrate_Err14 ); |
| } |
| socks_[nodeInfo[i].pnid] = existingSyncFd; // ReIntegrateSock |
| |
| // Set bit indicating node is up |
| upNodes_.upNodes[nodeInfo[i].pnid/MAX_NODE_BITMASK] |= |
| (1ull << (nodeInfo[i].pnid%MAX_NODE_BITMASK)); |
| |
| if (trace_settings & (TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf( "%s@%d socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , nodeInfo[i].pnid, socks_[nodeInfo[i].pnid]); |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| trace_printf( "%s@%d Integrating node %s (pnid=%d) " |
| "sees set[%d]: %llx\n" |
| , method_name, __LINE__ |
| , MyNode->GetName(), MyPNID |
| , i, upNodes_.upNodes[i] ); |
| } |
| } |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_6, MyPNID, i); |
| } |
| else if ( nodeInfo[i].pnid != MyPNID) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d Connection to node %d not attempted, " |
| "since it's unavailable port information.\n" |
| "nodeInfo[%d].commPort=%s\n" |
| "nodeInfo[%d].syncPort=%s\n" |
| "IntegratingMonitorPort=%s\n" |
| , method_name, __LINE__ |
| , nodeInfo[i].pnid |
| , i, nodeInfo[i].commPort |
| , i, nodeInfo[i].syncPort |
| , IntegratingMonitorPort); |
| } |
| } |
| } |
| |
| if (trace_settings & (TRACE_RECOVERY | TRACE_INIT)) |
| { |
| for (int i=0; i<pnodeCount; i++) |
| { |
| if (nodeInfo[i].pnid == -1) continue; |
| if (Node[nodeInfo[i].pnid] == NULL) continue; |
| trace_printf( "%s@%d - Node info for pnid=%d (%s)\n" |
| " Node[%d] commPort=%s\n" |
| " Node[%d] syncPort=%s\n" |
| " Node[%d] creatorPNid=%d\n" |
| , method_name, __LINE__ |
| , Node[nodeInfo[i].pnid]->GetPNid() |
| , Node[nodeInfo[i].pnid]->GetName() |
| , nodeInfo[i].pnid, Node[nodeInfo[i].pnid]->GetCommPort() |
| , nodeInfo[i].pnid, Node[nodeInfo[i].pnid]->GetSyncPort() |
| , nodeInfo[i].pnid, nodeInfo[i].creatorPNid); |
| } |
| for ( int i =0; i < pnodeCount; i++ ) |
| { |
| if (nodeInfo[i].pnid == -1) continue; |
| trace_printf( "%s@%d socks_[%d]=%d, sockPorts_[%d]=%d\n" |
| , method_name, __LINE__ |
| , nodeInfo[i].pnid, socks_[nodeInfo[i].pnid] |
| , nodeInfo[i].pnid, sockPorts_[nodeInfo[i].pnid]); |
| } |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| trace_printf( "%s@%d Integrating node %s (pnid=%d) " |
| "sees set[%d]: %llx\n" |
| , method_name, __LINE__ |
| , MyNode->GetName(), MyPNID |
| , i, upNodes_.upNodes[i] ); |
| } |
| } |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_7, MyPNID); |
| |
| TEST_POINT( TP015_NODE_UP ); |
| |
| // Inform creator monitor that connections are complete and |
| // this monitor is ready to participate in "allgather" |
| // communications with the other monitors. |
| SendReIntegrateStatus( State_Up, 0 ); |
| |
| mem_log_write(CMonLog::MON_REINTEGRATE_8, MyPNID); |
| |
| MyNode->SetState( State_Merged ); |
| |
| delete[] nodeInfo; |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::ResetIntegratingPNid( void ) |
| { |
| const char method_name[] = "CCluster::ResetIntegratingPNid"; |
| TRACE_ENTRY; |
| |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| if ( joinComm_ != MPI_COMM_NULL ) |
| { |
| MPI_Comm_free( &joinComm_ ); |
| joinComm_ = MPI_COMM_NULL; |
| } |
| break; |
| case CommType_Sockets: |
| if ( joinSock_ != -1 ) |
| { |
| close(joinSock_); |
| joinSock_ = -1; |
| } |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| |
| if ( MyNode->IsCreator() ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Resetting creator pnid=%d\n", |
| method_name, __LINE__, MyPNID ); |
| } |
| |
| MyNode->SetCreator( false, -1, -1 ); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Resetting integratingPNid_=%d\n", |
| method_name, __LINE__, integratingPNid_ ); |
| } |
| |
| integratingPNid_ = -1; |
| |
| #ifdef NAMESERVER_PROCESS |
| if (!CommAcceptMon.isAccepting()) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Triggering commAcceptorMon thread to begin accepting connections\n", |
| method_name, __LINE__ ); |
| } |
| |
| // Indicate to the commAcceptor thread to begin accepting connections |
| CommAcceptMon.startAccepting(); |
| } |
| #endif |
| |
| if (!CommAccept.isAccepting()) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Triggering commAcceptor thread to begin accepting connections\n", |
| method_name, __LINE__ ); |
| } |
| |
| // Indicate to the commAcceptor thread to begin accepting connections |
| CommAccept.startAccepting(); |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::SetIntegratingPNid( int pnid ) |
| { |
| const char method_name[] = "CCluster::SetIntegratingPNid"; |
| TRACE_ENTRY; |
| |
| integratingPNid_ = pnid; |
| |
| TRACE_EXIT; |
| } |
| |
| // Save information about a new communicator for a node that is reintegrating |
| void CCluster::addNewComm(int pnid, int otherRank, MPI_Comm comm) |
| { |
| const char method_name[] = "CCluster::addNewComm"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf("%s@%d - saving communicator for pnid %d\n", |
| method_name, __LINE__, pnid); |
| } |
| |
| // Insert info for new comm into list |
| commInfo_t commInfo = {pnid, otherRank, comm, -1, {0, 0}}; |
| clock_gettime(CLOCK_REALTIME, &commInfo.ts); |
| |
| newCommsLock_.lock(); |
| newComms_.push_back( commInfo ); |
| newCommsLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| |
| // A node is reintegrating. Add the communicator for the node to the set of |
| // communicators used by "Allgather". |
| void CCluster::setNewComm( int pnid ) |
| { |
| const char method_name[] = "CCluster::setNewComm"; |
| TRACE_ENTRY; |
| |
| newComms_t::iterator it; |
| bool foundComm = false; |
| |
| if ( comms_[pnid] != MPI_COMM_NULL ) |
| { // Unexpectedly already have a communicator for this node |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s] Unexpectedly already have a " |
| "communicator for node %d\n", method_name, pnid); |
| mon_log_write(MON_CLUSTER_SETNEWCOMM_1, SQ_LOG_ERR, buf); |
| |
| MPI_Comm_free( &comms_[pnid] ); |
| if ( CommType == CommType_Sockets ) |
| { |
| shutdown( socks_[pnid], SHUT_RDWR); |
| close( socks_[pnid] ); |
| socks_[pnid] = -1; |
| } |
| } |
| |
| newCommsLock_.lock(); |
| for ( it = newComms_.begin(); it != newComms_.end(); ) |
| { |
| if ( it->pnid == pnid ) |
| { |
| if ( comms_[pnid] != MPI_COMM_NULL ) |
| { // Found another communicator for the specified node. |
| // Disconnect from the previous one. It must be a |
| // stale leftover from a previous reintegration |
| // attempt for the node. |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf("%s@%d - discarding stale communicator for " |
| "pnid %d\n", method_name, __LINE__, pnid); |
| } |
| |
| MPI_Comm_free( &comms_[pnid] ); |
| if ( CommType == CommType_Sockets ) |
| { |
| shutdown( socks_[pnid], SHUT_RDWR); |
| close( socks_[pnid] ); |
| socks_[pnid] = -1; |
| } |
| --currentNodes_; |
| } |
| |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf("%s@%d - setting new communicator for pnid %d, " |
| "otherRank=%d\n", |
| method_name, __LINE__, it->pnid, it->otherRank); |
| } |
| |
| comms_[it->pnid] = it->comm; |
| otherMonRank_[it->pnid] = it->otherRank; |
| ++currentNodes_; |
| // Set bit indicating node is up |
| upNodes_.upNodes[it->pnid/MAX_NODE_BITMASK] |= (1ull << (it->pnid%MAX_NODE_BITMASK)); |
| |
| // Delete current list element and advance to next one |
| it = newComms_.erase ( it ); |
| |
| foundComm = true; |
| } |
| else |
| { // Advance to next list element |
| ++it; |
| } |
| } |
| newCommsLock_.unlock(); |
| |
| if ( !foundComm ) |
| { // We have no communicator for the specified node. |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s] Could not find a communicator for " |
| "node %d\n", method_name, pnid); |
| mon_log_write(MON_CLUSTER_SETNEWCOMM_2, SQ_LOG_ERR, buf); |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| // Save information about a new socket for a node that is reintegrating |
| void CCluster::addNewSock(int pnid, int otherRank, int sockFd) |
| { |
| const char method_name[] = "CCluster::addNewSock"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf("%s@%d - saving socket for pnid %d\n", |
| method_name, __LINE__, pnid); |
| } |
| |
| // Insert info for new comm into list |
| commInfo_t commInfo = {pnid, otherRank, MPI_COMM_NULL, sockFd, {0, 0}}; |
| clock_gettime(CLOCK_REALTIME, &commInfo.ts); |
| |
| newCommsLock_.lock(); |
| newComms_.push_back( commInfo ); |
| newCommsLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| |
| // A node is reintegrating. Add the socket for the node to the set of |
| // communicators used by "Allgather". |
| void CCluster::setNewSock( int pnid ) |
| { |
| const char method_name[] = "CCluster::setNewSock"; |
| TRACE_ENTRY; |
| |
| newComms_t::iterator it; |
| bool foundSocket = false; |
| |
| if ( socks_[pnid] != -1 ) |
| { // Unexpectedly already have a communicator for this node |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s] Unexpectedly already have a " |
| "socket for node %d\n", method_name, pnid); |
| mon_log_write(MON_CLUSTER_SETNEWSOCK_1, SQ_LOG_ERR, buf); |
| |
| shutdown( socks_[pnid], SHUT_RDWR); |
| close( socks_[pnid] ); |
| socks_[pnid] = -1; |
| } |
| |
| newCommsLock_.lock(); |
| for ( it = newComms_.begin(); it != newComms_.end(); ) |
| { |
| if ( it->pnid == pnid ) |
| { |
| if ( socks_[pnid] != -1 ) |
| { // Found another socket for the specified node. |
| // Disconnect from the previous one. It must be a |
| // stale leftover from a previous reintegration |
| // attempt for the node. |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf("%s@%d - discarding stale communicator for " |
| "pnid %d\n", method_name, __LINE__, pnid); |
| } |
| |
| shutdown( socks_[pnid], SHUT_RDWR); |
| close( socks_[pnid] ); |
| socks_[pnid] = -1; |
| --currentNodes_; |
| } |
| |
| CNode *node = Nodes->GetNode( it->pnid ); |
| socks_[it->pnid] = it->socket; // setNewSock |
| sockPorts_[it->pnid] = node->GetSyncSocketPort(); |
| otherMonRank_[it->pnid] = it->otherRank; |
| ++currentNodes_; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting new communicator for %d (%s), " |
| "socks_[%d]=%d, sockPorts_[%d]=%d, otherMonRank_[%d]=%d\n" |
| , method_name, __LINE__ |
| , node->GetPNid() |
| , node->GetName() |
| , it->pnid, socks_[it->pnid] |
| , it->pnid, sockPorts_[it->pnid] |
| , it->pnid, otherMonRank_[it->pnid] ); |
| } |
| |
| // Set bit indicating node is up |
| upNodes_.upNodes[it->pnid/MAX_NODE_BITMASK] |= (1ull << (it->pnid%MAX_NODE_BITMASK)); |
| |
| // Delete current list element and advance to next one |
| it = newComms_.erase ( it ); |
| |
| foundSocket = true; |
| } |
| else |
| { // Advance to next list element |
| ++it; |
| } |
| } |
| newCommsLock_.unlock(); |
| |
| if ( !foundSocket ) |
| { // We have no communicator for the specified node. |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s] Could not find a socket for " |
| "node %d\n", method_name, pnid); |
| mon_log_write(MON_CLUSTER_SETNEWSOCK_2, SQ_LOG_ERR, buf); |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| int CCluster::Allgather( int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats ) |
| { |
| const char method_name[] = "CCluster::Allgather"; |
| TRACE_ENTRY; |
| |
| int err = 0; |
| |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| err = AllgatherIB( nbytes, sbuf, rbuf, tag, stats ); |
| break; |
| case CommType_Sockets: |
| err = AllgatherSock( nbytes, sbuf, rbuf, tag, stats ); |
| break; |
| default: |
| // Programmer bonehead! |
| MPI_Abort(MPI_COMM_SELF,99); |
| } |
| |
| TRACE_EXIT; |
| return err; |
| } |
| |
| int CCluster::AllgatherIB( int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats ) |
| { |
| const char method_name[] = "CCluster::AllgatherIB"; |
| TRACE_ENTRY; |
| |
| int e; |
| int err = 0; |
| |
| MPI_Request r[2*GetConfigPNodesCount()]; |
| MPI_Status s[2*GetConfigPNodesCount()]; |
| for ( int i = 0; i < 2*GetConfigPNodesCount(); i++ ) |
| { |
| s[i].MPI_ERROR = MPI_SUCCESS; |
| r[i] = MPI_REQUEST_NULL; |
| } |
| |
| char *cp = rbuf; |
| for ( int i = 0; i < GetConfigPNodesCount(); i++ ) |
| { |
| if ( comms_[i] != MPI_COMM_NULL && otherMonRank_[i] != -1 ) |
| { |
| e = MPI_Send_init( sbuf, nbytes, MPI_CHAR, otherMonRank_[i], tag, |
| comms_[i], &r[i] ); |
| if ( e != MPI_SUCCESS ) |
| { |
| MPI_Error_class( e, &err ); |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Comunication error with pnid=%d (%s), " |
| "MPI_Send_init() error=%s (%d)\n" |
| , method_name, i, Node[i]->GetName() |
| , ErrorMsg(e), e ); |
| mon_log_write(MON_CLUSTER_ALLGATHERIB_1, SQ_LOG_ERR, buf); |
| goto early_exit; |
| } |
| |
| e = MPI_Recv_init( cp, CommBufSize, MPI_CHAR, otherMonRank_[i], tag, |
| comms_[i], &r[i+GetConfigPNodesCount()] ); |
| if ( e != MPI_SUCCESS ) |
| { |
| MPI_Error_class( e, &err ); |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Comunication error with pnid=%d (%s), " |
| "MPI_Recv_init() error=%s (%d)\n" |
| , method_name, i, Node[i]->GetName() |
| , ErrorMsg(e), e ); |
| mon_log_write(MON_CLUSTER_ALLGATHERIB_2, SQ_LOG_ERR, buf); |
| goto early_exit; |
| } |
| } |
| cp += CommBufSize; |
| } |
| for ( int i = 0; i < 2*GetConfigPNodesCount(); i++ ) |
| { |
| if ( r[i] == MPI_REQUEST_NULL ) continue; |
| e = MPI_Start( &r[i] ); |
| if ( e != MPI_SUCCESS ) |
| { |
| MPI_Error_class( e, &err ); |
| char buf[MON_STRING_BUF_SIZE]; |
| int pnid = (i < GetConfigPNodesCount()) ? i : (i - GetConfigPNodesCount()); |
| snprintf( buf, sizeof(buf) |
| , "[%s], Comunication error with pnid=%d (%s), " |
| "MPI_Start() error=%s (%d)\n" |
| , method_name, pnid, Node[pnid]->GetName() |
| , ErrorMsg(e), e ); |
| mon_log_write(MON_CLUSTER_ALLGATHERIB_3, SQ_LOG_ERR, buf); |
| goto early_exit; |
| } |
| } |
| |
| inBarrier_ = true; |
| if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED)) |
| MonStats->BarrierWaitIncr(); |
| |
| e = MPI_Waitall( GetConfigPNodesCount()*2, r, s ); |
| if ( e != MPI_SUCCESS ) |
| { |
| MPI_Error_class( e, &err ); |
| if ( err != MPI_ERR_IN_STATUS ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s], MPI_Waitall() error=%s (%d)\n" |
| , method_name, ErrorMsg(e), e ); |
| mon_log_write(MON_CLUSTER_ALLGATHERIB_4, SQ_LOG_ERR, buf); |
| inBarrier_ = false; |
| goto early_exit; |
| } |
| } |
| |
| if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED)) |
| MonStats->BarrierWaitDecr(); |
| inBarrier_ = false; |
| |
| for ( int i = 0; i < GetConfigPNodesCount(); i++ ) |
| { |
| stats[i] = s[i+GetConfigPNodesCount()]; |
| } |
| if ( e == MPI_SUCCESS ) |
| { |
| err = MPI_SUCCESS; |
| goto early_exit; |
| } |
| |
| for ( int i = 0; i < GetConfigPNodesCount(); i++ ) |
| { |
| if ( s[i].MPI_ERROR != MPI_SUCCESS && // send |
| s[i+GetConfigPNodesCount()].MPI_ERROR == MPI_SUCCESS ) // receive |
| { |
| stats[i].MPI_ERROR = s[i].MPI_ERROR; |
| } |
| } |
| |
| early_exit: |
| |
| for ( int i = 0; i < 2*GetConfigPNodesCount(); i++ ) |
| { |
| if ( r[i] != MPI_REQUEST_NULL ) |
| { |
| MPI_Request_free( &r[i] ); |
| } |
| } |
| |
| barrierCount_++; |
| |
| TRACE_EXIT; |
| return err; |
| } |
| |
| int CCluster::AllgatherSock( int nbytes, void *sbuf, char *rbuf, int tag, MPI_Status *stats ) |
| { |
| const char method_name[] = "CCluster::AllgatherSock"; |
| TRACE_ENTRY; |
| |
| bool reconnecting = false; |
| static int hdrSize = Nodes->GetSyncHdrSize( ); |
| int err = MPI_SUCCESS; |
| peer_t p[GetConfigPNodesMax()]; |
| memset( p, 0, sizeof(p) ); |
| tag = tag; // make compiler happy |
| // Set to twice the ZClient session timeout |
| static int sessionTimeout = ZClientEnabled |
| ? (ZClient->GetSessionTimeout() * 2) : 120; |
| |
| int nsent = 0, nrecv = 0; |
| for ( int iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ ) |
| { |
| peer_t *peer = &p[indexToPnid_[iPeer]]; |
| stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_SUCCESS; |
| stats[indexToPnid_[iPeer]].count = 0; |
| if ( indexToPnid_[iPeer] == MyPNID || socks_[indexToPnid_[iPeer]] == -1 ) |
| { |
| peer->p_sending = peer->p_receiving = false; |
| nsent++; |
| nrecv++; |
| } |
| else |
| { |
| peer->p_sending = peer->p_receiving = true; |
| peer->p_sent = peer->p_received = 0; |
| peer->p_timeout_count = 0; |
| peer->p_initial_check = true; |
| peer->p_n2recv = -1; |
| peer->p_buff = ((char *) rbuf) + (indexToPnid_[iPeer] * CommBufSize); |
| |
| struct epoll_event event; |
| event.data.fd = socks_[indexToPnid_[iPeer]]; |
| event.events = EPOLLIN | EPOLLOUT | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; |
| EpollCtl( epollFD_, EPOLL_CTL_ADD, socks_[indexToPnid_[iPeer]], &event ); |
| } |
| } |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_SYNC_DETAIL)) |
| { |
| for ( int i = 0; i < GetConfigPNodesCount(); i++ ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| peer_t *peer = &p[indexToPnid_[i]]; |
| trace_printf( "%s@%d" " - socks_[%d]=%d, " |
| "peer->p_sending=%d, " |
| "peer->p_receiving=%d\n" |
| , method_name, __LINE__ |
| , indexToPnid_[i] |
| , socks_[indexToPnid_[i]] |
| , peer->p_sending |
| , peer->p_receiving ); |
| } |
| } |
| } |
| |
| inBarrier_ = true; |
| MonStats->BarrierWaitIncr( ); |
| |
| static int sv_epoll_wait_timeout = -2; |
| static int sv_epoll_retry_count = 1; |
| if ( sv_epoll_wait_timeout == -2 ) |
| { |
| char *lv_epoll_wait_timeout_env = getenv( "SQ_MON_EPOLL_WAIT_TIMEOUT" ); |
| if ( lv_epoll_wait_timeout_env ) |
| { |
| // convert to milliseconds |
| sv_epoll_wait_timeout = atoi( lv_epoll_wait_timeout_env ) * 1000; |
| char *lv_epoll_retry_count_env = getenv( "SQ_MON_EPOLL_RETRY_COUNT" ); |
| if ( lv_epoll_retry_count_env ) |
| { |
| sv_epoll_retry_count = atoi( lv_epoll_retry_count_env ); |
| } |
| if ( sv_epoll_retry_count > 180 ) |
| { |
| sv_epoll_retry_count = 180; |
| } |
| } |
| else |
| { |
| // default to 64 seconds |
| sv_epoll_wait_timeout = 16000; |
| sv_epoll_retry_count = 4; |
| } |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] EPOLL timeout wait_timeout=%d msecs, retry_count=%d\n" |
| , method_name |
| , __LINE__ |
| , sv_epoll_wait_timeout |
| , sv_epoll_retry_count ); |
| |
| mon_log_write( MON_CLUSTER_ALLGATHERSOCK_1, SQ_LOG_INFO, buf ); |
| } |
| |
| // do the work |
| struct epoll_event events[2*GetConfigPNodesMax() + 1]; |
| while ( 1 ) |
| { |
| reconnected: |
| bool checkConnections = false; |
| bool doReconnect = false; |
| bool resetConnections = false; |
| int peerTimedoutCount = 0; |
| int maxEvents = 2*GetConfigPNodesCount() - nsent - nrecv; |
| if ( maxEvents == 0 ) break; |
| int nw; |
| peer_t *peer; |
| |
| while ( 1 ) |
| { |
| nw = epoll_wait( epollFD_, events, maxEvents, sv_epoll_wait_timeout ); |
| if ( nw >= 0 || errno != EINTR ) break; |
| } |
| |
| if ( nw == 0 ) |
| { // Timeout, no fd's ready |
| for ( int iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ ) |
| { // Check no IO completion on peers |
| peer = &p[indexToPnid_[iPeer]]; |
| if ( (peer->p_receiving) || (peer->p_sending) ) |
| { |
| peerTimedoutCount++; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - EPOLL timeout (%d) on: %s(%d), " |
| "socks_[%d]=%d, " |
| "peer->p_sending=%d, " |
| "peer->p_receiving=%d\n" |
| , method_name, __LINE__ |
| , peerTimedoutCount |
| , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer] |
| , indexToPnid_[iPeer] |
| , socks_[indexToPnid_[iPeer]] |
| , peer->p_sending |
| , peer->p_receiving ); |
| } |
| |
| if (peer->p_initial_check && !reconnecting) |
| { // Set the session timeout relative to now |
| peer->p_initial_check = false; |
| clock_gettime(CLOCK_REALTIME, &peer->znodeFailedTime); |
| peer->znodeFailedTime.tv_sec += sessionTimeout; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d" " - Znode Fail Time %ld(secs)\n" |
| , method_name, __LINE__ |
| , peer->znodeFailedTime.tv_sec); |
| } |
| } |
| |
| if ( IsRealCluster && peer->p_timeout_count < sv_epoll_retry_count ) |
| { |
| peer->p_timeout_count++; |
| checkConnections = true; |
| if (peer->p_timeout_count == sv_epoll_retry_count) |
| { |
| resetConnections = true; |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d" " - Peer timed out: %s(%d), " |
| "socks_[%d]=%d, " |
| "peer->p_timeout_count=%d\n" |
| , method_name, __LINE__ |
| , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer] |
| , indexToPnid_[iPeer] |
| , socks_[indexToPnid_[iPeer]] |
| , peer->p_timeout_count ); |
| } |
| } |
| } |
| } // Check no IO completion on peers |
| |
| if (checkConnections) |
| { |
| checkConnections = false; |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf( "%s@%d - Initializing AllgatherSockReconnect()," |
| " peerTimedoutCount=%d\n" |
| , method_name, __LINE__ |
| , peerTimedoutCount ); |
| } |
| // First, check ability to connect to all peers |
| // An err returned will mean that connect failed with |
| // at least one peer. No err implies that possible network |
| // reset occurred and there is probably one dead connection |
| // to a peer where no IOs will complete ever, so connections |
| // to all peers must be reestablished. |
| err = AllgatherSockReconnect( stats, false ); |
| if (err == MPI_SUCCESS) |
| { // Connections to all peers are good |
| if (resetConnections) |
| { // Establish new connections on all peers |
| resetConnections = false; |
| err = AllgatherSockReconnect( stats, true ); |
| // Redrive IOs on new peer connections |
| nsent = 0; nrecv = 0; |
| for ( int i = 0; i < GetConfigPNodesCount(); i++ ) |
| { |
| peer = &p[indexToPnid_[i]]; |
| if ( indexToPnid_[i] == MyPNID || socks_[indexToPnid_[i]] == -1 ) |
| { // peer is me or not available |
| peer->p_sending = peer->p_receiving = false; |
| nsent++; |
| nrecv++; |
| } |
| else |
| { |
| peer->p_sending = peer->p_receiving = true; |
| peer->p_sent = peer->p_received = 0; |
| peer->p_n2recv = -1; |
| peer->p_buff = ((char *) rbuf) + (indexToPnid_[i] * CommBufSize); |
| struct epoll_event event; |
| event.data.fd = socks_[indexToPnid_[i]]; |
| event.events = EPOLLIN | EPOLLOUT | EPOLLET | EPOLLRDHUP | EPOLLERR | EPOLLHUP; |
| EpollCtl( epollFD_, EPOLL_CTL_ADD, socks_[indexToPnid_[i]], &event ); |
| } |
| } |
| } // (resetConnections) |
| } // (err == MPI_SUCCESS) |
| else |
| { |
| for ( int i = 0; i < GetConfigPNodesCount(); i++ ) |
| { |
| peer = &p[indexToPnid_[i]]; |
| if ( indexToPnid_[i] != MyPNID && socks_[indexToPnid_[i]] == -1 ) |
| { // peer is me or no longer available |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY) && |
| (peer->p_sending || peer->p_receiving) ) |
| { |
| trace_printf( "%s@%d No IO completion on %s(%d):socks_[%d]=%d, " |
| "peer->p_sending=%d, " |
| "peer->p_receiving=%d\n" |
| , method_name, __LINE__ |
| , Node[indexToPnid_[i]]->GetName(), indexToPnid_[i] |
| , indexToPnid_[i] |
| , socks_[indexToPnid_[i]] |
| , peer->p_sending |
| , peer->p_receiving ); |
| } |
| if (peer->p_sending) |
| { |
| nsent++; |
| peer->p_sending = false; |
| } |
| if (peer->p_receiving) |
| { |
| peer->p_receiving = false; |
| nrecv++; |
| } |
| } |
| } |
| } |
| doReconnect = true; |
| } // (checkConnections) |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| for ( int i = 0; i < GetConfigPNodesCount(); i++ ) |
| { |
| peer = &p[indexToPnid_[i]]; |
| trace_printf( "%s@%d doReconnect=%d, %s(%d):socks_[%d]=%d, " |
| "peer->p_sending=%d, " |
| "peer->p_receiving=%d\n" |
| , method_name, __LINE__ |
| , doReconnect |
| , Node[indexToPnid_[i]]->GetName(), indexToPnid_[i] |
| , indexToPnid_[i] |
| , socks_[indexToPnid_[i]] |
| , peer->p_sending |
| , peer->p_receiving ); |
| } |
| } |
| |
| if (doReconnect) |
| { |
| reconnectSeqNum_ = seqNum_; |
| reconnecting = true; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d" " - Reconnecting! (reconnectSeqNum_=%lld)\n" |
| , method_name, __LINE__, reconnectSeqNum_ ); |
| } |
| goto reconnected; |
| } |
| } // ( nw == 0 ) |
| |
| if ( nw < 0 ) |
| { // Got an error |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] epoll_wait(%d, %d) error: %s\n", |
| method_name, __LINE__, epollFD_, maxEvents, |
| strerror_r( errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_ALLGATHERSOCK_3, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| |
| // Process fd's which are ready to initiate an IO or completed IO |
| for ( int iEvent = 0; iEvent < nw; iEvent++ ) |
| { |
| bool stateChange = false; |
| int fd = events[iEvent].data.fd; |
| int iPeer; |
| for ( iPeer = 0; iPeer < GetConfigPNodesCount(); iPeer++ ) |
| { // Find corresponding peer by matching socket fd |
| if ( events[iEvent].data.fd == socks_[indexToPnid_[iPeer]] ) break; |
| } |
| if ( indexToPnid_[iPeer] < 0 || indexToPnid_[iPeer] >= GetConfigPNodesMax() || indexToPnid_[iPeer] == MyPNID |
| || socks_[indexToPnid_[iPeer]] == -1 |
| || (!p[indexToPnid_[iPeer]].p_sending && !p[indexToPnid_[iPeer]].p_receiving) ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] Invalid peer %d, " |
| "peer.p_sending=%d, " |
| "peer.p_receiving=%d\n" |
| , method_name, __LINE__ |
| , indexToPnid_[iPeer] |
| , indexToPnid_[iPeer] >= GetConfigPNodesMax()?-1:p[indexToPnid_[iPeer]].p_sending |
| , indexToPnid_[iPeer] >= GetConfigPNodesMax()?-1:p[indexToPnid_[iPeer]].p_receiving ); |
| mon_log_write( MON_CLUSTER_ALLGATHERSOCK_4, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| peer_t *peer = &p[indexToPnid_[iPeer]]; |
| if ( (events[iEvent].events & EPOLLERR) || |
| (events[iEvent].events & EPOLLHUP) || |
| ( !(events[iEvent].events & (EPOLLIN|EPOLLOUT))) ) |
| { |
| // An error has occurred on this fd, or the socket is not |
| // ready for reading nor writing |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] Error: peer=%d, events[%d].data.fd=%d, event[%d]=%s\n" |
| , method_name, __LINE__ |
| , indexToPnid_[iPeer] |
| , iEvent |
| , events[iEvent].data.fd |
| , iEvent |
| , EpollEventString(events[iEvent].events) ); |
| mon_log_write( MON_CLUSTER_ALLGATHERSOCK_5, SQ_LOG_CRIT, buf ); |
| stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_ERR_EXITED; |
| err = MPI_ERR_IN_STATUS; |
| if ( peer->p_sending ) |
| { |
| peer->p_sending = false; |
| nsent++; |
| } |
| if ( peer->p_receiving ) |
| { |
| peer->p_receiving = false; |
| nrecv++; |
| } |
| stateChange = true; |
| goto early_exit; |
| } |
| if ( peer->p_receiving && events[iEvent].events & EPOLLIN ) |
| { // Got receive (read) completion |
| int eagain_ok = 0; |
| read_again: |
| char *r = &peer->p_buff[peer->p_received]; |
| int n2get; |
| if ( peer->p_received >= hdrSize ) |
| { |
| n2get = peer->p_n2recv; |
| } |
| else |
| { |
| n2get = hdrSize - peer->p_received; |
| } |
| int nr; |
| while ( 1 ) |
| { |
| if (trace_settings & TRACE_SYNC_DETAIL) |
| { |
| trace_printf( "%s@%d - EPOLLIN from %s(%d)," |
| " sending=%d," |
| " receiving=%d (%d)" |
| " sent=%d," |
| " received=%d" |
| " timeout_count=%d," |
| " initial_check=%d," |
| " n2recv=%d\n" |
| , method_name, __LINE__ |
| , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer] |
| , peer->p_sending |
| , peer->p_receiving, n2get |
| , peer->p_sent |
| , peer->p_received |
| , peer->p_timeout_count |
| , peer->p_initial_check |
| , peer->p_n2recv ); |
| } |
| nr = recv( fd, r, n2get, 0 ); |
| if ( nr > 0 ) Meas.addSockAllGatherRcvdBytes( nr ); |
| if ( nr >= 0 || errno == EINTR ) break; |
| } |
| if ( nr < 0 ) |
| { |
| if ( nr < 0 && eagain_ok && errno == EAGAIN ) |
| { |
| // do nothing |
| } |
| else |
| { |
| // error, down socket |
| int err = errno; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] recv[%d](%d) error %d (%s)\n" |
| , method_name, __LINE__ |
| , indexToPnid_[iPeer], nr , err, strerror(err) ); |
| mon_log_write( MON_CLUSTER_ALLGATHERSOCK_6, SQ_LOG_CRIT, buf ); |
| peer->p_receiving = false; |
| nrecv++; |
| if ( peer->p_sending ) |
| { |
| peer->p_sending = false; |
| nsent++; |
| } |
| stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_ERR_EXITED; |
| err = MPI_ERR_IN_STATUS; |
| stateChange = true; |
| } |
| } |
| else |
| { |
| peer->p_received += nr; |
| if ( peer->p_received < hdrSize ) |
| { |
| // do nothing |
| } |
| else |
| { |
| if ( peer->p_received == hdrSize ) |
| { |
| // got the complete header, get buffer size |
| struct sync_buffer_def *sb; |
| sb = (struct sync_buffer_def *)peer->p_buff; |
| peer->p_n2recv = sb->msgInfo.msg_offset; |
| if ( peer->p_n2recv ) |
| { |
| eagain_ok = 1; |
| goto read_again; |
| } |
| } |
| else |
| { |
| // reading buffer, update counters |
| peer->p_n2recv -= nr; |
| } |
| if ( peer->p_n2recv < 0 ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), |
| "[%s@%d] error n2recv %d\n", |
| method_name, __LINE__, peer->p_n2recv ); |
| mon_log_write( MON_CLUSTER_ALLGATHERSOCK_7, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| if ( peer->p_n2recv == 0 ) |
| { |
| // this buffer is done |
| peer->p_receiving = false; |
| nrecv++; |
| stats[indexToPnid_[iPeer]].count = peer->p_received; |
| if (trace_settings & TRACE_SYNC_DETAIL) |
| { |
| trace_printf( "%s@%d - EPOLLOUT to %s(%d)," |
| " sending=%d," |
| " receiving=%d (%d)" |
| " sent=%d," |
| " received=%d" |
| " timeout_count=%d," |
| " initial_check=%d," |
| " n2recv=%d\n" |
| , method_name, __LINE__ |
| , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer] |
| , peer->p_sending |
| , peer->p_receiving, n2get |
| , peer->p_sent |
| , peer->p_received |
| , peer->p_timeout_count |
| , peer->p_initial_check |
| , peer->p_n2recv ); |
| } |
| stateChange = true; |
| } |
| } |
| } |
| } |
| if ( peer->p_sending && events[iEvent].events & EPOLLOUT ) |
| { // Got send (write) completion |
| char *s = &((char *)sbuf)[peer->p_sent]; |
| int n2send = nbytes - peer->p_sent; |
| int ns; |
| while ( 1 ) |
| { |
| if (trace_settings & TRACE_SYNC_DETAIL) |
| { |
| trace_printf( "%s@%d - EPOLLOUT to %s(%d)," |
| " sending=%d (%d)," |
| " receiving=%d" |
| " sent=%d," |
| " received=%d" |
| " timeout_count=%d," |
| " initial_check=%d," |
| " n2recv=%d\n" |
| , method_name, __LINE__ |
| , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer] |
| , peer->p_sending, n2send |
| , peer->p_receiving |
| , peer->p_sent |
| , peer->p_received |
| , peer->p_timeout_count |
| , peer->p_initial_check |
| , peer->p_n2recv ); |
| } |
| ns = send( fd, s, n2send, 0 ); |
| if ( ns > 0 ) Meas.addSockAllGatherSentBytes( ns ); |
| if ( ns >= 0 || errno != EINTR ) break; |
| } |
| if ( ns < 0 ) |
| { |
| // error, down socket |
| int err = errno; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] send[%d](%d) error=%d (%s)\n" |
| , method_name, __LINE__ |
| , indexToPnid_[iPeer], ns, err, strerror(err) ); |
| mon_log_write( MON_CLUSTER_ALLGATHERSOCK_8, SQ_LOG_CRIT, buf ); |
| peer->p_sending = false; |
| nsent++; |
| if ( peer->p_receiving ) |
| { |
| peer->p_receiving = false; |
| nrecv++; |
| } |
| stats[indexToPnid_[iPeer]].MPI_ERROR = MPI_ERR_EXITED; |
| err = MPI_ERR_IN_STATUS; |
| stateChange = true; |
| } |
| else |
| { |
| peer->p_sent += ns; |
| if ( peer->p_sent == nbytes ) |
| { |
| // finished sending to this destination |
| peer->p_sending = false; |
| nsent++; |
| if (trace_settings & TRACE_SYNC_DETAIL) |
| { |
| trace_printf( "%s@%d - EPOLLOUT to %s(%d)," |
| " sending=%d (%d)," |
| " receiving=%d" |
| " sent=%d," |
| " received=%d" |
| " timeout_count=%d," |
| " initial_check=%d," |
| " n2recv=%d\n" |
| , method_name, __LINE__ |
| , Node[indexToPnid_[iPeer]]->GetName(), indexToPnid_[iPeer] |
| , peer->p_sending, n2send |
| , peer->p_receiving |
| , peer->p_sent |
| , peer->p_received |
| , peer->p_timeout_count |
| , peer->p_initial_check |
| , peer->p_n2recv ); |
| } |
| stateChange = true; |
| } |
| } |
| } |
| early_exit: |
| if ( stateChange ) |
| { |
| struct epoll_event event; |
| event.data.fd = socks_[indexToPnid_[iPeer]]; |
| int op = 0; |
| if ( !peer->p_sending && !peer->p_receiving ) |
| { |
| op = EPOLL_CTL_DEL; |
| event.events = 0; |
| } |
| else if ( peer->p_sending ) |
| { |
| op = EPOLL_CTL_MOD; |
| event.events = EPOLLOUT | EPOLLET | EPOLLRDHUP; |
| } |
| else if ( peer->p_receiving ) |
| { |
| op = EPOLL_CTL_MOD; |
| event.events = EPOLLIN | EPOLLET | EPOLLRDHUP; |
| } |
| if ( op == EPOLL_CTL_DEL || op == EPOLL_CTL_MOD ) |
| { |
| EpollCtl( epollFD_, op, fd, &event ); |
| } |
| } |
| } |
| } |
| |
| MonStats->BarrierWaitDecr( ); |
| inBarrier_ = false; |
| |
| barrierCount_++; |
| |
| TRACE_EXIT; |
| return err; |
| } |
| |
| int CCluster::AllgatherSockReconnect( MPI_Status *stats, bool reestablishConnections ) |
| { |
| const char method_name[] = "CCluster::AllgatherSockReconnect"; |
| TRACE_ENTRY; |
| |
| int err = MPI_SUCCESS; |
| int idst; |
| int reconnectSock = -1; |
| CNode *node; |
| |
| // Loop on each node in the cluster |
| for ( int i = 0; i < GetConfigPNodesMax(); i++ ) |
| { |
| // Loop on each adjacent node in the cluster |
| for ( int j = i+1; j < GetConfigPNodesMax(); j++ ) |
| { |
| if ( i == MyPNID ) |
| { // Current [i] node is my node, so connect to [j] node |
| |
| idst = j; |
| node = Nodes->GetNode( idst ); |
| if (!node) continue; |
| if (node->GetState() != State_Up) |
| { |
| if (socks_[idst] != -1) |
| { // Peer socket is still active |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Node %s (%d) is not up, " |
| "removing old socket from epoll set, " |
| "socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst, socks_[idst] ); |
| } |
| stats[idst].MPI_ERROR = MPI_ERR_EXITED; |
| stats[idst].count = 0; |
| err = MPI_ERR_IN_STATUS; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting Node %s (%d) status to " |
| "stats[%d].MPI_ERROR=%s\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst |
| , ErrorMsg(stats[idst].MPI_ERROR) ); |
| } |
| |
| --currentNodes_; |
| // Clear bit in set of "up nodes" |
| upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK)); |
| |
| // Remove old socket from epoll set, it may not be there |
| struct epoll_event event; |
| event.data.fd = socks_[idst]; |
| event.events = 0; |
| EpollCtlDelete( epollFD_, socks_[idst], &event ); |
| socks_[idst] = -1; |
| } |
| continue; |
| } |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Pinging Node %s (%d) to see if it's up\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() ); |
| } |
| if (PingSockPeer(node)) |
| { |
| reconnectSock = ConnectSockPeer( node, idst, reestablishConnections ); |
| if (reconnectSock == -1) |
| { |
| stats[idst].MPI_ERROR = MPI_ERR_EXITED; |
| stats[idst].count = 0; |
| err = MPI_ERR_IN_STATUS; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting Node %s (%d) status to " |
| "stats[%d].MPI_ERROR=%s\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst |
| , ErrorMsg(stats[idst].MPI_ERROR) ); |
| } |
| } |
| } |
| else |
| { |
| if (socks_[idst] != -1) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Node %s (%d) is not up, " |
| "removing old socket from epoll set, " |
| "socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst, socks_[idst] ); |
| } |
| |
| --currentNodes_; |
| // Clear bit in set of "up nodes" |
| upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK)); |
| |
| // Remove old socket from epoll set, it may not be there |
| struct epoll_event event; |
| event.data.fd = socks_[idst]; |
| event.events = 0; |
| EpollCtlDelete( epollFD_, socks_[idst], &event ); |
| socks_[idst] = -1; |
| } |
| reconnectSock = -1; |
| stats[idst].MPI_ERROR = MPI_ERR_EXITED; |
| stats[idst].count = 0; |
| err = MPI_ERR_IN_STATUS; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting Node %s (%d) status to " |
| "stats[%d].MPI_ERROR=%s\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst |
| , ErrorMsg(stats[idst].MPI_ERROR) ); |
| } |
| } |
| } |
| else if ( j == MyPNID ) |
| { // Current [j] is my node, accept connection from peer [i] node |
| |
| idst = i; |
| node = Nodes->GetNode( idst ); |
| if (!node) continue; |
| if (node->GetState() != State_Up) |
| { |
| if (socks_[idst] != -1) |
| { // Peer socket is still active |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Node %s (%d) is not up, " |
| "removing old socket from epoll set, " |
| "socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst, socks_[idst] ); |
| } |
| stats[idst].MPI_ERROR = MPI_ERR_EXITED; |
| stats[idst].count = 0; |
| err = MPI_ERR_IN_STATUS; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting Node %s (%d) status to " |
| "stats[%d].MPI_ERROR=%s\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst |
| , ErrorMsg(stats[idst].MPI_ERROR) ); |
| } |
| |
| --currentNodes_; |
| // Clear bit in set of "up nodes" |
| upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK)); |
| |
| // Remove old socket from epoll set, it may not be there |
| struct epoll_event event; |
| event.data.fd = socks_[idst]; |
| event.events = 0; |
| EpollCtlDelete( epollFD_, socks_[idst], &event ); |
| socks_[idst] = -1; |
| } |
| continue; |
| } |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Pinging Node %s (%d) to see if it's up\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() ); |
| } |
| if (PingSockPeer(node)) |
| { |
| reconnectSock = AcceptSockPeer( node, idst, reestablishConnections ); |
| if (reconnectSock == -1) |
| { |
| stats[idst].MPI_ERROR = MPI_ERR_EXITED; |
| stats[idst].count = 0; |
| err = MPI_ERR_IN_STATUS; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting Node %s (%d) status to " |
| "stats[%d].MPI_ERROR=%s\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst |
| , ErrorMsg(stats[idst].MPI_ERROR) ); |
| } |
| } |
| } |
| else |
| { |
| if (socks_[idst] != -1) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Node %s (%d) is not up, " |
| "removing old socket from epoll set, " |
| "socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst, socks_[idst] ); |
| } |
| |
| --currentNodes_; |
| // Clear bit in set of "up nodes" |
| upNodes_.upNodes[idst/MAX_NODE_BITMASK] &= ~(1ull << (idst%MAX_NODE_BITMASK)); |
| |
| // Remove old socket from epoll set, it may not be there |
| struct epoll_event event; |
| event.data.fd = socks_[idst]; |
| event.events = 0; |
| EpollCtlDelete( epollFD_, socks_[idst], &event ); |
| socks_[idst] = -1; |
| } |
| reconnectSock = -1; |
| stats[idst].MPI_ERROR = MPI_ERR_EXITED; |
| stats[idst].count = 0; |
| err = MPI_ERR_IN_STATUS; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Setting Node %s (%d) status to " |
| "stats[%d].MPI_ERROR=%s\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , idst |
| , ErrorMsg(stats[idst].MPI_ERROR) ); |
| } |
| } |
| } |
| else |
| { |
| idst = -1; |
| } |
| if ( idst >= 0 |
| && reconnectSock != -1 |
| && socks_[idst] != -1 |
| && fcntl( socks_[idst], F_SETFL, O_NONBLOCK ) ) |
| { |
| err = MPI_ERR_AMODE; |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] fcntl(socks_[%d]=%d,F_SETFL,NONBLOCK) error: %s\n", |
| method_name, __LINE__,idst, socks_[idst], strerror_r( errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_ALLGATHERSOCKRECONN_1, SQ_LOG_CRIT, buf ); |
| } |
| } |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| for ( int i = 0; i < GetConfigPNodesCount(); i++ ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d" " - socks_[%d]=%d, " |
| "stats[%d].MPI_ERROR=%s\n" |
| , method_name, __LINE__ |
| , indexToPnid_[i] |
| , socks_[indexToPnid_[i]] |
| , indexToPnid_[i] |
| , ErrorMsg(stats[indexToPnid_[i]].MPI_ERROR) ); |
| } |
| } |
| trace_printf( "%s@%d - Returning err=%d\n" |
| , method_name, __LINE__, err ); |
| } |
| |
| TRACE_EXIT; |
| return( err ); |
| } |
| |
| int CCluster::AcceptSockPeer( CNode *node, int peer, bool reestablishConnections ) |
| { |
| const char method_name[] = "CCluster::AcceptSockPeer"; |
| TRACE_ENTRY; |
| |
| int rc = MPI_SUCCESS; |
| int reconnectSock = -1; |
| struct hostent *he; |
| |
| // Get my host structure via my node name |
| he = gethostbyname( MyNode->GetName() ); |
| if ( !he ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] gethostbyname(%s) error: %s\n" |
| , method_name, __LINE__ |
| , MyNode->GetName() |
| , strerror_r( h_errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_ACCEPTSOCKPEER_1, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| else |
| { |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf( "%s@%d Accepting server socket: from %s(%d), port=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , MyNode->GetSyncSocketPort() ); |
| } |
| |
| // Accept connection from peer |
| reconnectSock = AcceptSock( syncSock_ ); |
| if (reconnectSock != -1) |
| { |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf( "%s@%d Server %s(%d) accepted from client %s(%d), old socks_[%d]=%d, new socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , MyNode->GetName(), MyPNID |
| , node->GetName(), node->GetPNid() |
| , peer, socks_[peer] |
| , peer, reconnectSock); |
| } |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] AcceptSock(%d) failed!\n", |
| method_name, __LINE__, syncSock_ ); |
| mon_log_write( MON_CLUSTER_ACCEPTSOCKPEER_2, SQ_LOG_ERR, buf ); |
| rc = -1; |
| } |
| |
| if (reestablishConnections) |
| { |
| if (socks_[peer] != -1) |
| { |
| // Remove old socket from epoll set, it may not be there |
| struct epoll_event event; |
| event.data.fd = socks_[peer]; |
| event.events = 0; |
| EpollCtlDelete( epollFD_, socks_[peer], &event ); |
| if (node->GetState() != State_Up) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Node %s (%d) is not up, " |
| "removing old socket from epoll set, " |
| "socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , peer, socks_[peer] ); |
| } |
| socks_[peer] = -1; |
| } |
| } |
| if (reconnectSock != -1) |
| { |
| socks_[peer] = reconnectSock; // AcceptSockPeer |
| } |
| } |
| else |
| { |
| if (reconnectSock != -1) |
| { |
| close( (int)reconnectSock ); |
| } |
| } |
| } |
| |
| TRACE_EXIT; |
| return rc; |
| } |
| |
| int CCluster::ConnectSockPeer( CNode *node, int peer, bool reestablishConnections ) |
| { |
| const char method_name[] = "CCluster::ConnectSockPeer"; |
| TRACE_ENTRY; |
| |
| int rc = MPI_SUCCESS; |
| int reconnectSock = -1; |
| unsigned char srcaddr[4], dstaddr[4]; |
| struct hostent *he; |
| |
| // Get my host structure via my node name |
| he = gethostbyname( MyNode->GetName() ); |
| if ( !he ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] gethostbyname(%s) error: %s\n" |
| , method_name, __LINE__ |
| , MyNode->GetName() |
| , strerror_r( h_errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_1, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| else |
| { |
| // Initialize my source address structure |
| memcpy( srcaddr, he->h_addr, 4 ); |
| // Get peer's host structure via its node name |
| he = gethostbyname( node->GetName() ); |
| if ( !he ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), |
| "[%s@%d] gethostbyname(%s) error: %s\n", |
| method_name, __LINE__, node->GetName(), |
| strerror_r( h_errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_2, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| // Initialize peer's destination address structure |
| memcpy( dstaddr, he->h_addr, 4 ); |
| |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf( "%s@%d Creating client socket: src=%d.%d.%d.%d, " |
| "dst(%s)=%d.%d.%d.%d, dst port=%d\n" |
| , method_name, __LINE__ |
| , (int)((unsigned char *)srcaddr)[0] |
| , (int)((unsigned char *)srcaddr)[1] |
| , (int)((unsigned char *)srcaddr)[2] |
| , (int)((unsigned char *)srcaddr)[3] |
| , node->GetName() |
| , (int)((unsigned char *)dstaddr)[0] |
| , (int)((unsigned char *)dstaddr)[1] |
| , (int)((unsigned char *)dstaddr)[2] |
| , (int)((unsigned char *)dstaddr)[3] |
| , sockPorts_[peer] ); |
| } |
| // Connect to peer |
| reconnectSock = MkCltSock( srcaddr, dstaddr, sockPorts_[peer] ); |
| if (reconnectSock != -1) |
| { |
| if (trace_settings & TRACE_RECOVERY) |
| { |
| trace_printf( "%s@%d Client %s(%d) connected to server %s(%d), old socks_[%d]=%d, new socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , MyNode->GetName(), MyPNID |
| , node->GetName(), node->GetPNid() |
| , peer, socks_[peer] |
| , peer, reconnectSock); |
| } |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] MkCltSock() src=%d.%d.%d.%d, " |
| "dst(%s)=%d.%d.%d.%d failed!\n" |
| , method_name, __LINE__ |
| , (int)((unsigned char *)srcaddr)[0] |
| , (int)((unsigned char *)srcaddr)[1] |
| , (int)((unsigned char *)srcaddr)[2] |
| , (int)((unsigned char *)srcaddr)[3] |
| , node->GetName() |
| , (int)((unsigned char *)dstaddr)[0] |
| , (int)((unsigned char *)dstaddr)[1] |
| , (int)((unsigned char *)dstaddr)[2] |
| , (int)((unsigned char *)dstaddr)[3] ); |
| mon_log_write( MON_CLUSTER_CONNECTSOCKPEER_3, SQ_LOG_ERR, buf ); |
| rc = -1; |
| } |
| |
| if (reestablishConnections) |
| { |
| if (socks_[peer] != -1) |
| { |
| // Remove old socket from epoll set, it may not be there |
| struct epoll_event event; |
| event.data.fd = socks_[peer]; |
| event.events = 0; |
| EpollCtlDelete( epollFD_, socks_[peer], &event ); |
| if (node->GetState() != State_Up) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Node %s (%d) is not up, " |
| "removing old socket from epoll set, " |
| "socks_[%d]=%d\n" |
| , method_name, __LINE__ |
| , node->GetName(), node->GetPNid() |
| , peer, socks_[peer] ); |
| } |
| socks_[peer] = -1; |
| } |
| } |
| if (reconnectSock != -1) |
| { |
| socks_[peer] = reconnectSock; // ConnectSockPeer |
| } |
| } |
| else |
| { |
| if (reconnectSock != -1) |
| { |
| close( (int)reconnectSock ); |
| } |
| } |
| } |
| |
| TRACE_EXIT; |
| return( rc ); |
| } |
| |
| // When we get a communication error for a point-to-point monitor communicator |
| // verify that the other nodes in the cluster also lost communications |
| // with that monitor. If all nodes concur we consider that monitor |
| // down. |
| void CCluster::ValidateClusterState( cluster_state_def_t nodestate[], |
| bool haveDivergence) |
| { |
| const char method_name[] = "CCluster::ValidateClusterState"; |
| |
| exitedMons_t::iterator it; |
| upNodes_t nodeMask; |
| |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| nodeMask.upNodes[i] = 0; |
| } |
| |
| for ( it = exitedMons_.begin(); it != exitedMons_.end(); ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d checking exited pnid=%d, detecting pnid=%d, seqNum=%lld" |
| " (current seqNum_=%lld)\n", method_name, __LINE__, |
| it->exitedPnid, it->detectingPnid, it->seqNum, seqNum_); |
| } |
| |
| if ( seqNum_ >= (it->seqNum + 2) ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s] Validating exited node %d, " |
| "detected by node %d at seq #%lld " |
| "(current seq # is %lld).\n", |
| method_name, it->exitedPnid, it->detectingPnid, |
| it->seqNum, seqNum_); |
| mon_log_write(MON_CLUSTER_VALIDATE_STATE_1, SQ_LOG_ERR, buf); |
| |
| int concurringNodes = 0; |
| |
| // Check if all active nodes see the node as down. |
| nodeMask.upNodes[it->exitedPnid/MAX_NODE_BITMASK] = 1ull << (it->exitedPnid%MAX_NODE_BITMASK); |
| string setSeesUp; |
| string setSeesDown; |
| char nodeX[10]; |
| |
| // Evaluate each active (up) node in the cluster |
| int pnodesCount = 0; |
| for (int index = 0; |
| index < GetConfigPNodesMax() && pnodesCount < currentNodes_; |
| ++index) |
| { |
| if ( nodestate[index].seq_num != 0 ) |
| { // There is valid nodestate info from node "index" |
| |
| pnodesCount++; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d down pnid= %d: nodestate[%d].nodeMask.upNodes[%d]=%llx, downNodeMask.upNodes[%d]=%llx\n" |
| , method_name, __LINE__ |
| , it->exitedPnid |
| , index, (it->exitedPnid/MAX_NODE_BITMASK) |
| , nodestate[index].nodeMask.upNodes[it->exitedPnid/MAX_NODE_BITMASK] |
| , (index/MAX_NODE_BITMASK) |
| , nodeMask.upNodes[it->exitedPnid/MAX_NODE_BITMASK] ); |
| } |
| |
| if ((nodestate[index].nodeMask.upNodes[it->exitedPnid/MAX_NODE_BITMASK] & |
| nodeMask.upNodes[it->exitedPnid/MAX_NODE_BITMASK]) == 0) |
| { // Node "pnid" sees the node as down |
| |
| // temp trace |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d node %d concurs that node %d " |
| "is down\n", method_name, __LINE__, |
| /*indexToPnid_[index]*/ index, it->exitedPnid); |
| } |
| |
| snprintf(nodeX, sizeof(nodeX), "%d, ", /*indexToPnid_[index]*/ index); |
| setSeesDown.append(nodeX); |
| |
| ++concurringNodes; |
| } |
| else |
| { |
| // temp trace |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d node %d says node %d is up\n", |
| method_name, __LINE__, /*indexToPnid_[index]*/ index, |
| it->exitedPnid); |
| } |
| |
| snprintf(nodeX, sizeof(nodeX), "%d, ", /*indexToPnid_[index]*/ index); |
| setSeesUp.append(nodeX); |
| |
| } |
| } |
| else |
| { |
| // temp trace |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d ignoring state from node %d\n", |
| method_name, __LINE__, /*indexToPnid_[index]*/ index); |
| } |
| } |
| } |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d concurringNodes=%d, currentNodes_=%d\n", |
| method_name, __LINE__, concurringNodes, currentNodes_); |
| } |
| |
| if (concurringNodes == currentNodes_) |
| { // General agreement that node is down, proceed to mark it down |
| |
| CNode *downNode = Nodes->GetNode( it->exitedPnid ); |
| if (downNode && downNode->GetState() != State_Down) |
| { |
| // temp trace |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d proceeding to mark node %d down\n", |
| method_name, __LINE__, it->exitedPnid); |
| } |
| |
| mem_log_write(CMonLog::MON_UPDATE_CLUSTER_3, it->exitedPnid); |
| |
| HandleDownNode(it->exitedPnid); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d Node pnid=%d is already down\n" |
| , method_name, __LINE__ |
| , it->exitedPnid); |
| } |
| } |
| } |
| else if ( concurringNodes != 0 && !enqueuedDown_ ) |
| { // Some monitors say the node is down, others don't. |
| // This is not supposed to happen. Enqueue request to |
| // bring this node down. All monitors will do the same |
| // so the cluster will be brought down. |
| if (setSeesUp.length() > 2) |
| setSeesUp.erase(setSeesUp.length()-2, 2); |
| if (setSeesDown.length() > 2) |
| setSeesDown.erase(setSeesDown.length()-2, 2); |
| char buf[MON_STRING_BUF_SIZE*2]; |
| snprintf( buf, sizeof(buf), "[%s] Lost connection to node " |
| "%d but only %d of %d nodes also lost the " |
| "connection. See up: %s. See down: %s. So node " |
| "%d is going down (at seq #%lld).\n", method_name, |
| it->exitedPnid, concurringNodes, currentNodes_, |
| setSeesUp.c_str(), setSeesDown.c_str(), |
| MyPNID, seqNum_ ); |
| mon_log_write(MON_CLUSTER_VALIDATE_STATE_2, SQ_LOG_ERR, buf); |
| |
| mem_log_write(CMonLog::MON_UPDATE_CLUSTER_4, MyPNID, |
| it->exitedPnid); |
| |
| enqueuedDown_ = true; |
| ReqQueue.enqueueDownReq(MyPNID); |
| } |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d removing exited pnid=%d, detecting pnid=%d, seqNum=%lld" |
| " (current seqNum_=%lld)\n", method_name, __LINE__, |
| it->exitedPnid, it->detectingPnid, it->seqNum, seqNum_); |
| } |
| // Delete current list element and advance to next one |
| it = exitedMons_.erase( it ); |
| } |
| else |
| { // Advance to next list element |
| ++it; |
| } |
| } |
| |
| |
| if ( haveDivergence ) |
| { |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s] Cluster view divergence (at seq #%lld), " |
| "node %d sees set[%d]: %llx\n" |
| , method_name, seqNum_, MyPNID, i |
| , upNodes_.upNodes[i] ); |
| mon_log_write(MON_CLUSTER_VALIDATE_STATE_3, SQ_LOG_ERR, buf); |
| } |
| |
| // For each "up node" (from local perspective) |
| // go through nodestate for each other node. If any node |
| // says the node is down, add an item to the exitedMons_ list |
| // for examination during the next sync cycle (by which time |
| // all nodes will have had a chance to detect the down monitor.) |
| |
| int pnodesCount2 = 0; |
| for (int remIndex = 0; |
| remIndex < GetConfigPNodesMax() && pnodesCount2 < currentNodes_; |
| ++remIndex) |
| { |
| bool someExited = false; |
| // No need to check local monitor's view of the cluster since |
| // any down connections are handled directly when detected. |
| if (/*indexToPnid_[remIndex]*/remIndex == MyPNID) |
| { |
| pnodesCount2++; |
| continue; |
| } |
| |
| // No need to check a remote monitor's view when node is down |
| CNode *remoteNode = Nodes->GetNode( /*indexToPnid_[remIndex]*/remIndex ); |
| if ( ! remoteNode ) |
| { // node is not member of cluster |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d Skipping non-existing node " |
| "pnid=%d\n", |
| method_name, __LINE__, |
| /*indexToPnid_[remIndex]*/remIndex); |
| } |
| continue; |
| } |
| else if (remoteNode->GetState() == State_Down) |
| { // node is down |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d Skipping down node " |
| "pnid=%d (%s)\n", |
| method_name, __LINE__, |
| /*indexToPnid_[remIndex]*/remIndex, remoteNode->GetName()); |
| } |
| continue; |
| } |
| else |
| { |
| pnodesCount2++; |
| } |
| |
| // Check if all active nodes see the node as up. |
| nodeMask.upNodes[/*indexToPnid_[remIndex]*/remIndex/MAX_NODE_BITMASK] = |
| 1ull << (/*indexToPnid_[remIndex]*/remIndex%MAX_NODE_BITMASK); |
| |
| if ( upNodes_.upNodes[/*indexToPnid_[remIndex]*/remIndex/MAX_NODE_BITMASK] & |
| nodeMask.upNodes[/*indexToPnid_[remIndex]*/remIndex/MAX_NODE_BITMASK] ) |
| { // This remote node sees node pnid as up |
| int pnodesCount3 = 0; |
| for (int exitedPNid = 0; |
| exitedPNid < GetConfigPNodesMax() && pnodesCount3 < currentNodes_; |
| ++exitedPNid) |
| { |
| CNode *exitedNode = Nodes->GetNode( /*indexToPnid_[remIndex]*/exitedPNid ); |
| if ( exitedNode && |
| (/*indexToPnid_[remIndex]*/remIndex != exitedPNid) && |
| (nodestate[remIndex].seq_num != 0) && |
| (nodestate[exitedPNid].nodeMask.upNodes[/*indexToPnid_[remIndex]*/remIndex/MAX_NODE_BITMASK] & |
| nodeMask.upNodes[/*indexToPnid_[remIndex]*/remIndex/MAX_NODE_BITMASK]) == 0 ) |
| { // Node remIndex sees exitedPNid as down |
| |
| pnodesCount3++; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d Divergence, queueing " |
| "monExited{%d, %d, %lld}\n", |
| method_name, __LINE__, exitedPNid, /*indexToPnid_[remIndex]*/remIndex, |
| seqNum_); |
| } |
| |
| someExited = true; |
| monExited_t monExited = {exitedPNid, /*indexToPnid_[remIndex]*/remIndex, seqNum_}; |
| exitedMons_.push_back( monExited ); |
| } |
| } |
| } |
| if (someExited) |
| { |
| // No need to look further for any other |
| // monitor's view of node pnid. When the |
| // exitedMons_ element is processed all nodes |
| // will be checked for concurrence. |
| break; |
| } |
| } |
| } |
| } |
| |
| bool CCluster::ValidateSeqNum( cluster_state_def_t nodestate[] ) |
| { |
| const char method_name[] = "CCluster::ValidateSeqNum"; |
| |
| unsigned long long seqNum; |
| unsigned long long loSeqNum = seqNum_; |
| unsigned long long hiSeqNum = seqNum_; |
| unsigned long long seqNumBucket[MAX_NODES]; |
| int seqNumCount[MAX_NODES]; |
| int maxBucket = 0; |
| bool found; |
| int mostCountsIndex; |
| |
| if ( GetConfigPNodesCount() == 1 ) return true; |
| |
| // Count occurrences of sequence numbers |
| for (int pnid = 0; pnid < GetConfigPNodesMax(); pnid++) |
| { |
| CNode *node = Nodes->GetNode( pnid ); |
| if (!node) continue; |
| if (node->GetState() != State_Up) continue; |
| |
| if ( pnid == MyPNID ) |
| { |
| seqNum = nodestate[pnid].seq_num = seqNum_; |
| } |
| else |
| { |
| seqNum = nodestate[pnid].seq_num; |
| } |
| |
| if (trace_settings & TRACE_SYNC) |
| { |
| trace_printf( "%s@%d seqNum_=%lld, nodestate[%d].seq_num=%lld\n" |
| , method_name, __LINE__ |
| , seqNum_ |
| , pnid |
| , nodestate[pnid].seq_num ); |
| } |
| |
| if (seqNum != 0) |
| { |
| loSeqNum = (seqNum < loSeqNum) ? seqNum : loSeqNum; |
| hiSeqNum = (seqNum > hiSeqNum) ? seqNum : hiSeqNum; |
| |
| found = false; |
| for (int i=0; i<maxBucket; ++i) |
| { |
| if ( seqNum == seqNumBucket[i] ) |
| { |
| ++seqNumCount[i]; |
| found = true; |
| break; |
| } |
| } |
| if ( ! found ) |
| { |
| seqNumBucket[maxBucket] = seqNum; |
| seqNumCount[maxBucket] = 1; |
| ++maxBucket; |
| } |
| } |
| } |
| |
| if ( maxBucket == 0 ) |
| { // Normal case, all nodes have same sequence number |
| mostCountsIndex = 0; |
| } |
| else |
| { // Look for majority sequence number |
| int mostCounts = 0; |
| mostCountsIndex = 0; |
| for (int i=0; i<maxBucket; ++i) |
| { |
| if ( seqNumCount[i] > mostCounts ) |
| { |
| mostCounts = seqNumCount[i]; |
| mostCountsIndex = i; |
| } |
| } |
| } |
| |
| lowSeqNum_ = loSeqNum; |
| highSeqNum_ = hiSeqNum; |
| |
| if (trace_settings & TRACE_SYNC) |
| { |
| if ( lowSeqNum_ != highSeqNum_ ) |
| { |
| trace_printf( "%s@%d Most common seq num=%lld (%d nodes), " |
| "%d buckets, low=%lld, high=%lld, local seq num (%lld) did not match.\n" |
| , method_name, __LINE__ |
| , seqNumBucket[mostCountsIndex] |
| , seqNumCount[mostCountsIndex] |
| , maxBucket |
| , lowSeqNum_ |
| , highSeqNum_ |
| , seqNum_ ); |
| } |
| } |
| |
| // Fail if any sequence number does not match |
| return( lowSeqNum_ == highSeqNum_ ); |
| } |
| |
| void CCluster::HandleDownNode( int pnid ) |
| { |
| const char method_name[] = "CCluster::HandleDownNode"; |
| TRACE_ENTRY; |
| |
| // Add to dead node name list |
| CNode *downNode = Nodes->GetNode( pnid ); |
| assert(downNode); |
| deadNodeList_.push_back( downNode ); |
| |
| if (trace_settings & TRACE_INIT) |
| trace_printf("%s@%d - Added down node to list, pnid=%d, name=(%s)\n", method_name, __LINE__, downNode->GetPNid(), downNode->GetName()); |
| |
| // assign new leaders if needed |
| AssignLeaders( pnid, downNode->GetName(), false ); |
| |
| // Build available list of spare nodes |
| CNode *spareNode; |
| NodesList *spareNodesList = Nodes->GetSpareNodesList(); |
| NodesList::iterator itSn; |
| for ( itSn = spareNodesList->begin(); itSn != spareNodesList->end() ; itSn++ ) |
| { |
| spareNode = *itSn; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - %s (pnid=%d) is in available spare node list, state=%s, spare=%d, rank failure=%d\n" |
| , method_name, __LINE__, spareNode->GetName(), spareNode->GetPNid() |
| , StateString(spareNode->GetState()), spareNode->IsSpareNode(), spareNode->IsRankFailure()); |
| // if spare node is available |
| if ( spareNode->IsSpareNode() && |
| !spareNode->IsRankFailure() && |
| spareNode->GetState() == State_Up ) |
| { |
| spareNodeVector_.push_back( spareNode ); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - pnid=%d, name=(%s) is available Spare\n", method_name, __LINE__, spareNode->GetPNid(), spareNode->GetName()); |
| } |
| } |
| |
| // Activate spare or down node |
| NodesList::iterator itDn; |
| for ( itDn = deadNodeList_.begin(); itDn != deadNodeList_.end() ; itDn++ ) |
| { |
| downNode = *itDn; |
| if ( Emulate_Down ) |
| { |
| ReqQueue.enqueueDownReq( downNode->GetPNid() ); |
| } |
| else |
| { |
| bool done = false; |
| spareNode = NULL; |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - spare node vector size=%ld\n" |
| , method_name, __LINE__, spareNodeVector_.size()); |
| // Find available spare node for current down node |
| for ( unsigned int ii = 0; ii < spareNodeVector_.size() && !done ; ii++ ) |
| { |
| PNidVector sparePNids = spareNodeVector_[ii]->GetSparePNids(); |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - spare pnids vector size=%ld\n" |
| , method_name, __LINE__, sparePNids.size()); |
| // Check each pnid it is configured to spare |
| for ( unsigned int jj = 0; jj < sparePNids.size(); jj++ ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d - %s (pnid=%d) is in spare node vector[%d], size=%ld\n" |
| , method_name, __LINE__ |
| , spareNodeVector_[ii]->GetName() |
| , spareNodeVector_[ii]->GetPNid() |
| , jj, sparePNids.size()); |
| // if this is a spare for the down node |
| if ( spareNodeVector_[ii]->IsSpareNode() && |
| downNode->GetPNid() == sparePNids[jj] ) |
| { |
| // assign it and remove it from the vector |
| spareNode = spareNodeVector_[ii]; |
| spareNodeVector_.erase( spareNodeVector_.begin() + ii ); |
| done = true; |
| break; |
| } |
| } |
| } |
| |
| if ( spareNode ) |
| { |
| Nodes->RemoveFromSpareNodesList( spareNode ); |
| downNode->SetState( State_Takeover ); // change state so that pending requests could fail. |
| spareNode->SetActivatingSpare( true ); |
| if ( spareNode->GetPNid() == MyPNID ) |
| { |
| ReqQueue.enqueueActivateSpareReq( spareNode, downNode, true ); |
| } |
| } |
| else |
| { |
| if ( downNode->IsSpareNode() ) |
| { |
| Nodes->RemoveFromSpareNodesList( downNode ); |
| } |
| ReqQueue.enqueueDownReq( downNode->GetPNid() ); |
| } |
| } |
| } |
| |
| spareNodeVector_.clear(); |
| deadNodeList_.clear(); |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::UpdateClusterState( bool &doShutdown, |
| struct sync_buffer_def * syncBuf, |
| MPI_Status *status, |
| int sentChangeNid) |
| { |
| const char method_name[] = "CCluster::UpdateClusterState"; |
| TRACE_ENTRY; |
| |
| struct sync_buffer_def *recvBuf; |
| #ifndef NAMESERVER_PROCESS |
| struct sync_buffer_def *sendBuf = Nodes->GetSyncBuffer(); |
| #endif |
| STATE node_state; |
| int change_nid; |
| cluster_state_def_t nodestate[GetConfigPNodesMax()]; |
| bool clusterViewDivergence = false; |
| |
| |
| // Populate nodestate array using node state info from "allgather" |
| // along with local node state. |
| for (int index = 0; index < GetConfigPNodesMax(); index++) |
| { |
| // Only process active nodes |
| bool noComm; |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| noComm = (comms_[index] == MPI_COMM_NULL) ? true : false; |
| break; |
| case CommType_Sockets: |
| noComm = (socks_[index] == -1) ? true : false; |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| |
| if (noComm |
| || status[index].MPI_ERROR != MPI_SUCCESS) |
| { |
| if (trace_settings & (TRACE_RECOVERY | TRACE_INIT)) |
| { |
| if (!noComm) |
| { |
| trace_printf( "%s@%d - Communication error from node %d, " |
| " seq_num=#%lld\n" |
| , method_name, __LINE__, index |
| , seqNum_ ); |
| } |
| } |
| // Not an active node, set default values |
| nodestate[index].node_state = State_Unknown; |
| nodestate[index].change_nid = -1; |
| nodestate[index].seq_num = 0; |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| nodestate[index].nodeMask.upNodes[i] = 0; |
| } |
| #ifdef NAMESERVER_PROCESS |
| nodestate[index].monConnCount = -1; |
| #else |
| nodestate[index].monProcCount = 0; |
| #endif |
| |
| continue; |
| } |
| |
| recvBuf = (struct sync_buffer_def *) |
| (((char *) syncBuf) + index * CommBufSize); |
| |
| if (trace_settings & TRACE_SYNC) |
| { |
| int nr; |
| MPI_Get_count(&status[index], MPI_CHAR, &nr); |
| trace_printf("%s@%d - Received %d bytes from node %d, " |
| ", seq_num=%lld, message count=%d\n", |
| method_name, __LINE__, nr, index, |
| recvBuf->nodeInfo.seq_num, |
| recvBuf->msgInfo.msg_count); |
| } |
| |
| nodestate[index].node_state = recvBuf->nodeInfo.node_state; |
| nodestate[index].change_nid = recvBuf->nodeInfo.change_nid; |
| nodestate[index].seq_num = recvBuf->nodeInfo.seq_num; |
| nodestate[index].nodeMask = recvBuf->nodeInfo.nodeMask; |
| #ifdef NAMESERVER_PROCESS |
| nodestate[index].monConnCount = recvBuf->nodeInfo.monConnCount; |
| #else |
| nodestate[index].monProcCount = recvBuf->nodeInfo.monProcCount; |
| #endif |
| |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| if ( nodestate[index].nodeMask.upNodes[i] != upNodes_.upNodes[i] ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| for ( int j =0; j < MAX_NODE_MASKS ; j++ ) |
| { |
| trace_printf( "%s@%d - Divergence (at seq #%lld), node %s " |
| "(pnid=%d) sees cluster state[%d] %llx, local " |
| "monitor sees %llx\n" |
| , method_name, __LINE__ |
| , seqNum_ |
| , Node[index]->GetName() |
| , index |
| , j |
| , nodestate[index].nodeMask.upNodes[j] |
| , upNodes_.upNodes[j] ); |
| } |
| } |
| clusterViewDivergence = true; |
| } |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - Node %s (pnid=%d) TmSyncState=(%d)(%s)\n" |
| , method_name, __LINE__ |
| , Node[index]->GetName() |
| , index |
| , recvBuf->nodeInfo.tmSyncState |
| , SyncStateString( recvBuf->nodeInfo.tmSyncState )); |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| if ( Node[index]->GetTmSyncState() != recvBuf->nodeInfo.tmSyncState ) |
| { |
| Node[index]->SetTmSyncState(recvBuf->nodeInfo.tmSyncState); |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated" |
| " (%d)(%s)\n", method_name, __LINE__, |
| Node[index]->GetName(), index, |
| recvBuf->nodeInfo.tmSyncState, |
| SyncStateString( recvBuf->nodeInfo.tmSyncState )); |
| } |
| } |
| #endif |
| |
| // Check if we need to increase my node's shutdown level ... |
| // all nodes should be at the highest level selected from any source |
| if ( MyNode->GetShutdownLevel() < recvBuf->nodeInfo.sdLevel ) |
| { |
| MyNode->SetShutdownLevel( recvBuf->nodeInfo.sdLevel ); |
| if (MyNode->GetState() == State_Up) |
| { |
| MyNode->SetState( State_Shutdown ); |
| } |
| if (trace_settings & (TRACE_REQUEST | TRACE_SYNC)) |
| trace_printf("%s@%d - Node %s Shutdown Level updated (%d)\n", |
| method_name, __LINE__, |
| Node[index]->GetName(), recvBuf->nodeInfo.sdLevel); |
| } |
| |
| Node[index]->SetInternalState( recvBuf->nodeInfo.internalState ); |
| if ( recvBuf->nodeInfo.internalState == State_Ready_To_Exit ) |
| { // The node is exiting. Don't communicate with it any more. |
| if (trace_settings & (TRACE_REQUEST | TRACE_SYNC)) |
| trace_printf("%s@%d - Node %s (%d) ready to exit, setting comm " |
| "to null\n", method_name, __LINE__, |
| Node[index]->GetName(), index); |
| |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| MPI_Comm_free( &comms_[index] ); |
| break; |
| case CommType_Sockets: |
| shutdown( socks_[index], SHUT_RDWR ); |
| close( socks_[index] ); |
| socks_[index] = -1; |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| Node[index]->SetState( State_Down ); |
| --currentNodes_; |
| // Clear bit in set of "up nodes" |
| upNodes_.upNodes[index/MAX_NODE_BITMASK] &= ~(1ull << (index%MAX_NODE_BITMASK)); |
| } |
| } |
| |
| if ( (checkSeqNum_ || reconnectSeqNum_ != 0) |
| && !ValidateSeqNum( nodestate ) |
| && !enqueuedDown_ ) |
| { |
| if ( reconnectSeqNum_ == 0 && MyNode->GetState() == State_Up ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s] Sync cycle sequence number (%lld) " |
| "incorrect. Aborting!\n", method_name, seqNum_); |
| mon_log_write(MON_CLUSTER_UPDTCLUSTERSTATE_1, SQ_LOG_CRIT, buf); |
| mem_log_write(CMonLog::MON_UPDATE_CLUSTER_2, MyPNID); |
| abort(); |
| } |
| } |
| |
| nodestate[MyPNID].node_state = Node[MyPNID]->GetState(); |
| nodestate[MyPNID].change_nid = sentChangeNid; |
| nodestate[MyPNID].seq_num = seqNum_; |
| nodestate[MyPNID].nodeMask = upNodes_; |
| #ifdef NAMESERVER_PROCESS |
| nodestate[MyPNID].monConnCount = Node[MyPNID]->GetMonConnCount(); |
| #else |
| nodestate[MyPNID].monProcCount = Node[MyPNID]->GetNumProcs(); |
| #endif |
| |
| // Examine status returned from MPI receive requests |
| for (int index = 0; index < GetConfigPNodesMax(); index++) |
| { |
| bool noComm; |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| noComm = (comms_[index] == MPI_COMM_NULL) ? true : false; |
| break; |
| case CommType_Sockets: |
| noComm = (socks_[index] == -1) ? true : false; |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| if (noComm) continue; |
| |
| if (status[index].MPI_ERROR != MPI_SUCCESS) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s] MPI communications error=%d " |
| "(%s) for node %d (at seq #%lld).\n", method_name, |
| status[index].MPI_ERROR, ErrorMsg(status[index].MPI_ERROR), |
| index, seqNum_); |
| mon_log_write(MON_CLUSTER_UPDTCLUSTERSTATE_2, SQ_LOG_ERR, buf); |
| |
| if ( status[index].MPI_ERROR == MPI_ERR_EXITED ) |
| { // A monitor has gone away |
| |
| mem_log_write(CMonLog::MON_UPDATE_CLUSTER_1, index); |
| |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| MPI_Comm_free( &comms_[index] ); |
| break; |
| case CommType_Sockets: |
| shutdown( socks_[index], SHUT_RDWR ); |
| close( socks_[index] ); |
| socks_[index] = -1; |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| --currentNodes_; |
| |
| // Clear bit in set of "up nodes" |
| upNodes_.upNodes[index/MAX_NODE_BITMASK] &= ~(1ull << (index%MAX_NODE_BITMASK)); |
| |
| // Pretend node is still up until down node processing |
| // completes. |
| nodestate[index].node_state = State_Unknown; |
| nodestate[index].change_nid = -1; |
| nodestate[index].seq_num = 0; |
| for ( int i =0; i < MAX_NODE_MASKS ; i++ ) |
| { |
| nodestate[index].nodeMask.upNodes[i] = 0; |
| } |
| #ifdef NAMESERVER_PROCESS |
| nodestate[index].monConnCount = -1; |
| #else |
| nodestate[index].monProcCount = 0; |
| #endif |
| |
| if ( validateNodeDown_ ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf( "%s@%d Divergence, queueing " |
| "monExited{%d, %d, %lld}\n" |
| , method_name, __LINE__ |
| , index, MyPNID, seqNum_ ); |
| } |
| // Save info for the exited monitor so can confirm |
| // that all monitors have the same view. |
| monExited_t monExited = {index, MyPNID, seqNum_}; |
| exitedMons_.push_back( monExited ); |
| } |
| else |
| { |
| HandleDownNode(index); |
| } |
| } |
| } |
| } |
| |
| if ( validateNodeDown_ ) |
| ValidateClusterState( nodestate, clusterViewDivergence ); |
| |
| #ifndef NAMESERVER_PROCESS |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - Node %s (pnid=%d) TmSyncState=(%d)(%s)\n" |
| , method_name, __LINE__ |
| , MyNode->GetName() |
| , MyPNID |
| , sendBuf->nodeInfo.tmSyncState |
| , SyncStateString( sendBuf->nodeInfo.tmSyncState )); |
| } |
| #endif |
| |
| // Update our node states |
| for (int index = 0; index < GetConfigPNodesMax(); index++) |
| { |
| node_state = (STATE)nodestate[index].node_state; |
| change_nid = nodestate[index].change_nid; |
| |
| if ( index == MyPNID && |
| MyNode->GetState() == State_Merged && seqNum_ == 1) |
| { // Initial "allgather" for this re-integrated monitor. |
| |
| seqNum_ = EnsureAndGetSeqNum(nodestate); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT)) |
| { |
| trace_printf("%s@%d Completed initial allgather for pnid=%d, " |
| "state=%d(%s), seqNum_=%lld\n", method_name, __LINE__, |
| index, MyNode->GetState(), |
| StateString(MyNode->GetState()), seqNum_ ); |
| } |
| |
| // Queue the node up request for processing by a |
| // worker thread. |
| ReqQueue.enqueueUpReq( MyPNID, NULL, -1 ); |
| } |
| |
| if ( change_nid == MyPNID ) |
| { |
| if( MyNode->GetState() == State_Down || |
| MyNode->GetState() == State_Merged || |
| MyNode->GetState() == State_Joining ) |
| { |
| if (trace_settings & TRACE_RECOVERY) |
| trace_printf( "%s@%d enqueueing node up, state=%s\n", |
| method_name, __LINE__, |
| StateString(MyNode->GetState()) ); |
| |
| // Queue the node up request for processing by a |
| // worker thread. |
| ReqQueue.enqueueUpReq( MyPNID, NULL, -1 ); |
| } |
| else |
| { // This node is being "downed" |
| |
| if (trace_settings & TRACE_RECOVERY) |
| trace_printf( "%s@%d enqueueing node down, state=%s\n", |
| method_name, __LINE__, |
| StateString(MyNode->GetState()) ); |
| |
| // Queue the node down request for processing by a |
| // worker thread. |
| ReqQueue.enqueueDownReq( MyPNID ); |
| } |
| } |
| else |
| { |
| // In a real cluster, existing monitors need to merge new |
| // monitor. |
| |
| CNode *pnode = change_nid != -1 ? Nodes->GetNode( change_nid ) : NULL; |
| #ifdef NAMESERVER_PROCESS |
| if ( change_nid != -1 && pnode ) |
| #else |
| if ( ! Emulate_Down && change_nid != -1 && pnode ) |
| #endif |
| { |
| switch ( pnode->GetState() ) |
| { |
| case State_Down: |
| if (trace_settings & TRACE_RECOVERY) |
| trace_printf( "%s@%d - change_nid=%d, state=%s, " |
| "queueing up request\n", |
| method_name, __LINE__ , change_nid, |
| StateString(pnode->GetState())); |
| |
| mem_log_write(CMonLog::MON_UPDATE_CLUSTER_5, change_nid); |
| |
| // Queue the node up request for processing by a |
| // worker thread. |
| ReqQueue.enqueueUpReq( change_nid, |
| (char *)pnode->GetName(), |
| -1 ); |
| break; |
| case State_Merging: |
| if (trace_settings & TRACE_RECOVERY) |
| trace_printf( "%s@%d - change_nid=%d, state=%s, " |
| "queueing up request\n", |
| method_name, __LINE__ , change_nid, |
| StateString(pnode->GetState())); |
| |
| mem_log_write(CMonLog::MON_UPDATE_CLUSTER_6, change_nid); |
| |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| setNewComm(change_nid); |
| break; |
| case CommType_Sockets: |
| setNewSock(change_nid); |
| break; |
| default: |
| // Programmer bonehead! |
| MPI_Abort(MPI_COMM_SELF,99); |
| } |
| pnode->SetState( State_Merged ); |
| ReqQueue.enqueueUpReq( change_nid, |
| (char *)pnode->GetName(), |
| -1 ); |
| break; |
| |
| case State_Merged: |
| case State_Joining: |
| default: |
| if (trace_settings & TRACE_RECOVERY) |
| trace_printf( "%s@%d - change_nid=%d, state=%s, " |
| "no action required.\n", |
| method_name, __LINE__ , change_nid, |
| StateString( pnode->GetState() )); |
| break; |
| } |
| } |
| } |
| switch ( node_state ) |
| { |
| case State_Up: |
| case State_Joining: |
| case State_Merged: |
| case State_Merging: |
| case State_Initializing: |
| case State_Unlinked: |
| case State_Unknown: |
| break; |
| case State_Down: |
| doShutdown = true; |
| break; |
| case State_Stopped: |
| case State_Shutdown: |
| if (trace_settings & TRACE_SYNC_DETAIL) |
| trace_printf("%s@%d - Node %d is stopping.\n", method_name, __LINE__, index); |
| Node[index]->SetState( (STATE) node_state ); |
| doShutdown = true; |
| break; |
| default: |
| if (trace_settings & TRACE_SYNC) |
| trace_printf("%s@%d - Node %d in unknown state (%d).\n", |
| method_name, __LINE__, index, node_state); |
| } |
| } |
| |
| #ifdef NAMESERVER_PROCESS |
| // Update min monConnCount |
| int minConnCount = INT_MAX; |
| int minConnPnid = -1; |
| for (int index = 0; index < GetConfigPNodesMax(); index++) |
| { |
| int connCount = nodestate[index].monConnCount; |
| if ( ( connCount >= 0 ) && ( connCount < minConnCount ) ) |
| { |
| minConnPnid = index; |
| minConnCount = connCount; |
| } |
| } |
| myMonConnCount_ = nodestate[MyPNID].monConnCount; |
| minMonConnCount_ = minConnCount; |
| minMonConnPnid_ = minConnPnid; |
| #else |
| if (NameServerEnabled) |
| { |
| clusterProcCount_ = 0; |
| for (int index = 0; index < GetConfigPNodesMax(); index++) |
| { |
| clusterProcCount_ += nodestate[index].monProcCount; |
| } |
| } |
| #endif |
| |
| TRACE_EXIT; |
| } |
| |
| bool CCluster::ProcessClusterData( struct sync_buffer_def * syncBuf, |
| struct sync_buffer_def * sendBuf, |
| bool deferredTmSync ) |
| { |
| const char method_name[] = "CCluster::ProcessClusterData"; |
| TRACE_ENTRY; |
| |
| // Using the data returned from Allgather, process replication data |
| // from all nodes. If there are any TmSync messages from other |
| // nodes, defer processing until all other replicated data are |
| // processed. |
| struct internal_msg_def *msg; |
| struct sync_buffer_def *msgBuf; |
| bool haveDeferredTmSync = false; |
| |
| for (int i = 0; i < GetConfigPNodesMax(); i++) |
| { |
| bool noComm; |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| noComm = (comms_[i] == MPI_COMM_NULL) ? true : false; |
| break; |
| case CommType_Sockets: |
| noComm = (socks_[i] == -1) ? true : false; |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| // Only process active nodes |
| if (noComm && i != MyPNID) continue; |
| |
| if ( i == MyPNID ) |
| { // Get pointer to message sent by this node |
| msgBuf = sendBuf; |
| } |
| else |
| { // Compute pointer to receive buffer element for node "i" |
| msgBuf = (struct sync_buffer_def *) |
| (((char *) syncBuf) + i * CommBufSize); |
| } |
| |
| if (trace_settings & TRACE_SYNC) |
| { |
| trace_printf("%s@%d - Buffer for node %d, swpRecCount_=%d, seq_num=%lld, " |
| "lastSeqNum_=%lld, msg_count=%d, msg_offset=%d\n", |
| method_name, __LINE__, i, swpRecCount_, |
| msgBuf->nodeInfo.seq_num, |
| lastSeqNum_, |
| msgBuf->msgInfo.msg_count, |
| msgBuf->msgInfo.msg_offset); |
| } |
| |
| // if we have already processed buffer, skip it |
| if (lastSeqNum_ >= msgBuf->nodeInfo.seq_num) continue; |
| |
| if (trace_settings & TRACE_SYNC) |
| { |
| trace_printf("%s@%d - Processing buffer for node %d, swpRecCount_=%d, seq_num=%lld, " |
| "lastSeqNum_=%lld, msg_count=%d, msg_offset=%d\n", |
| method_name, __LINE__, i, swpRecCount_, |
| msgBuf->nodeInfo.seq_num, |
| lastSeqNum_, |
| msgBuf->msgInfo.msg_count, |
| msgBuf->msgInfo.msg_offset); |
| } |
| |
| // reset msg length to zero to initialize for PopMsg() |
| msgBuf->msgInfo.msg_offset = 0; |
| |
| #ifndef NAMESERVER_PROCESS |
| if ( msgBuf->msgInfo.msg_count == 1 |
| && (( internal_msg_def *)msgBuf->msg)->type == InternalType_Sync ) |
| { |
| if ( deferredTmSync ) |
| { // This node has sent a TmSync message. Process it now. |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Handling deferred TmSync messages for " |
| "node %d\n", method_name, __LINE__, i); |
| |
| struct internal_msg_def *msg; |
| msg = Nodes->PopMsg( msgBuf ); |
| |
| if ( i == MyPNID ) |
| HandleMyNodeMsg (msg, MyPNID); |
| else |
| HandleOtherNodeMsg (msg, i); |
| } |
| else |
| { |
| // This node has sent a TmSync message. Defer processing |
| // until we handle all of the non-TmSync messages from |
| // other nodes. |
| haveDeferredTmSync = true; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Deferring TmSync processing for node" |
| " %d until replicated data is handled\n", |
| method_name, __LINE__, i); |
| } |
| } |
| else if ( !deferredTmSync ) |
| #else |
| if ( !deferredTmSync ) |
| #endif |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf("%s@%d - Handling messages for " |
| "node %d\n", method_name, __LINE__, i); |
| do |
| { |
| // Get the next sync msg for the node |
| msg = Nodes->PopMsg( msgBuf ); |
| if (msg->type == InternalType_Null) break; |
| |
| if ( i == MyPNID ) |
| HandleMyNodeMsg (msg, MyPNID); |
| else |
| HandleOtherNodeMsg (msg, i); |
| } |
| while ( true ); |
| } |
| } |
| |
| TRACE_EXIT; |
| |
| return haveDeferredTmSync; |
| } |
| |
| bool CCluster::checkIfDone ( ) |
| { |
| const char method_name[] = "CCluster::checkIfDone"; |
| TRACE_ENTRY; |
| |
| int nameServerCount = 0; |
| CClusterConfig *clusterConfig = Nodes->GetClusterConfig(); |
| CNameServerConfigContainer *nameServerConfigContainer = NULL; |
| |
| #ifdef NAMESERVER_PROCESS |
| |
| if (clusterConfig) |
| { |
| nameServerConfigContainer = Nodes->GetNameServerConfig(); |
| if (nameServerConfigContainer) |
| { |
| nameServerCount = nameServerConfigContainer->GetCount(); |
| } |
| } |
| #else |
| int myNameServerCount = 0; |
| CNameServerConfig *nameServerConfig = NULL; |
| |
| if (NameServerEnabled && clusterConfig) |
| { |
| nameServerConfigContainer = Nodes->GetNameServerConfig(); |
| if (nameServerConfigContainer) |
| { |
| nameServerCount = nameServerConfigContainer->GetCount(); |
| if (IsRealCluster) |
| { |
| nameServerConfig = nameServerConfigContainer->GetConfig( Node_name ); |
| if (nameServerConfig) |
| { |
| myNameServerCount = 1; |
| } |
| } |
| else |
| { |
| if (nameServerCount && MyPNID < nameServerCount) |
| { |
| myNameServerCount = 1; |
| } |
| } |
| } |
| } |
| #endif |
| |
| #ifdef NAMESERVER_PROCESS |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Node %d shutdown level=%d, state=%s. Process " |
| "count=%d, internal state=%d, currentNodes_=%d, " |
| "local process count=%d, shutdownNameServer=%d, " |
| "nameServerCount=%d\n", |
| method_name, __LINE__, |
| MyNode->GetPNid(), |
| MyNode->GetShutdownLevel(), |
| StateString(MyNode->GetState()), |
| Nodes->ProcessCount(), |
| MyNode->getInternalState(), |
| currentNodes_, |
| MyNode->GetNumProcs(), |
| MyNode->IsShutdownNameServer(), |
| nameServerCount ); |
| |
| #else |
| if (NameServerEnabled) |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Node %d shutdown level=%d, state=%s. Cluster process " |
| "count=%d, internal state=%d, currentNodes_=%d, " |
| "local process count=%d\n", |
| method_name, __LINE__, MyNode->GetPNid(), |
| MyNode->GetShutdownLevel(), |
| StateString(MyNode->GetState()), |
| clusterProcCount_, |
| MyNode->getInternalState(), |
| currentNodes_, MyNode->GetNumProcs()); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Node %d shutdown level=%d, state=%s. Process " |
| "count=%d, internal state=%d, currentNodes_=%d, " |
| "local process count=%d\n", |
| method_name, __LINE__, MyNode->GetPNid(), |
| MyNode->GetShutdownLevel(), |
| StateString(MyNode->GetState()), |
| Nodes->ProcessCount(), |
| MyNode->getInternalState(), |
| currentNodes_, MyNode->GetNumProcs()); |
| } |
| #endif |
| // Check if we are also done |
| if (( MyNode->GetState() != State_Down ) && |
| ( MyNode->GetState() != State_Stopped ) ) |
| { |
| if ( MyNode->GetShutdownLevel() != ShutdownLevel_Undefined ) |
| { |
| #ifdef NAMESERVER_PROCESS |
| if ( (Nodes->ProcessCount() <= nameServerCount ) // only Name Servers alive |
| && (MyNode->GetNumProcs() <= MAX_PRIMITIVES ) // only My Name Server alive |
| && MyNode->IsShutdownNameServer() // monitor shutdown Name Server received |
| && !MyNode->isInQuiesceState() ) // post-quiescing will |
| // expire WDG (cluster) |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Name Server signaled to exit.\n", method_name, __LINE__); |
| MyNode->SetState( State_Stopped ); |
| MyNode->SetInternalState(State_Ready_To_Exit); |
| |
| // we need to sync one more time so other nodes see our state |
| return false; |
| } |
| #else |
| if ( NameServerEnabled ) |
| { |
| |
| if ( clusterProcCount_ == 0 ) // all Name Servers exited |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Monitor signaled to exit.\n", method_name, __LINE__); |
| MyNode->SetState( State_Stopped ); |
| MyNode->SetInternalState(State_Ready_To_Exit); |
| |
| // we need to sync one more time so other nodes see our state |
| return false; |
| } |
| else if ( (clusterProcCount_ <= |
| (currentNodes_ * (MAX_PRIMITIVES+1)) ) // only WDGs and Name Servers alive |
| && (MyNode->GetNumProcs() <= |
| (MAX_PRIMITIVES+1) ) // only WDGs and Name Servers alive |
| && !MyNode->isInQuiesceState() // post-quiescing will |
| // expire WDG (cluster) |
| && !waitForWatchdogExit_ ) // WDG not yet exiting |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Stopping watchdog process. " |
| "(process count: cluster=%d, MyNode=%d)\n", |
| method_name, __LINE__, |
| Nodes->ProcessCount(), MyNode->ProcessCount()); |
| |
| waitForWatchdogExit_ = true; |
| // stop the watchdog timer first |
| HealthCheck.setState(MON_STOP_WATCHDOG); |
| // let the watchdog process exit |
| HealthCheck.setState(MON_EXIT_PRIMITIVES); |
| } |
| else if ( (MyNode->GetNumProcs() <= // only My Name Server alive |
| myNameServerCount ) |
| && !MyNode->isInQuiesceState() // post-quiescing will |
| // expire WDG (cluster) |
| && !waitForNameServerExit_ ) // Name Server not yet exiting |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Stopping Name Server process. " |
| "(process count: cluster=%d, MyNode=%d)\n", |
| method_name, __LINE__, |
| Nodes->ProcessCount(), MyNode->ProcessCount()); |
| |
| waitForNameServerExit_ = true; |
| int rc = NameServer->ProcessShutdown(); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Shutdown request to Name Server failed, node going down\n" |
| , method_name ); |
| mon_log_write( MON_CLUSTER_CHECKIFDONE_1, SQ_LOG_ERR, la_buf ); |
| ReqQueue.enqueueDownReq( MyPNID ); |
| } |
| } |
| } |
| else |
| { |
| if ( Nodes->ProcessCount() == 0 ) // all WDTs exited |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Monitor signaled to exit.\n", method_name, __LINE__); |
| MyNode->SetState( State_Stopped ); |
| MyNode->SetInternalState(State_Ready_To_Exit); |
| |
| // we need to sync one more time so other nodes see our state |
| return false; |
| } |
| else if ( (Nodes->ProcessCount() <= |
| (currentNodes_*MAX_PRIMITIVES)) // only WDGs alive |
| && !MyNode->isInQuiesceState() // post-quiescing will |
| // expire WDG (cluster) |
| && !waitForWatchdogExit_ ) // WDG not yet exiting |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - Stopping watchdog process.\n", |
| method_name, __LINE__); |
| |
| waitForWatchdogExit_ = true; |
| // stop the watchdog timer first |
| HealthCheck.setState(MON_STOP_WATCHDOG); |
| // let the watchdog process exit |
| HealthCheck.setState(MON_EXIT_PRIMITIVES); |
| } |
| } |
| #endif |
| } |
| } |
| else if ( MyNode->GetShutdownLevel() != ShutdownLevel_Undefined |
| && MyNode->GetState() == State_Down |
| && MyNode->GetNumProcs() == 0) |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) |
| trace_printf("%s@%d - No processes remaining, monitor exiting.\n", |
| method_name, __LINE__); |
| |
| MyNode->SetState( State_Stopped ); |
| MyNode->SetInternalState(State_Ready_To_Exit); |
| // we need to sync one more time so other nodes see our state |
| return false; |
| } |
| |
| MyNode->CheckShutdownProcessing(); |
| |
| TRACE_EXIT; |
| |
| return ( MyNode->getInternalState() == State_Ready_To_Exit ); |
| } |
| |
| |
| // Gather "Allgather" performance statistics |
| // Given the beginning and ending time of an "Allgather" operation, compute |
| // the elapsed time and increment the count for the appropriate range |
| // bucket. |
| |
| const struct timespec CCluster::agBuckets_[] = { |
| {0, 0}, // lowest |
| {0, 20000}, // 20 us |
| {0, 50000}, // 50 us |
| {0, 500000}, // 500 us |
| {0, 1000000}, // 1 ms |
| {0, 10000000}, // 10 ms |
| {0, 25000000}, // 25 ms |
| {0, 50000000}, // 50 ms |
| {0, 100000000}, // 100 ms |
| {0, 500000000}}; // 500 ms |
| const int CCluster::agBucketsSize_ = sizeof(agBuckets_)/sizeof(timespec); |
| |
| bool CCluster::agTimeStats(struct timespec & ts_begin, |
| struct timespec & ts_end) |
| { |
| const char method_name[] = "CCluster::agTimeStats"; |
| bool slowAg = false; |
| |
| struct timespec timediff; |
| if ( (ts_end.tv_nsec - ts_begin.tv_nsec ) < 0 ) |
| { |
| timediff.tv_sec = ts_end.tv_sec - ts_begin.tv_sec - 1; |
| timediff.tv_nsec = 1000000000 + ts_end.tv_nsec - ts_begin.tv_nsec; |
| } |
| else |
| { |
| timediff.tv_sec = ts_end.tv_sec - ts_begin.tv_sec; |
| timediff.tv_nsec = ts_end.tv_nsec - ts_begin.tv_nsec; |
| } |
| |
| if ( timediff.tv_sec > agMaxElapsed_.tv_sec |
| || (timediff.tv_sec == agMaxElapsed_.tv_sec |
| && timediff.tv_nsec > agMaxElapsed_.tv_nsec )) |
| // Have a new maximum elapsed time |
| agMaxElapsed_ = timediff; |
| |
| if ( timediff.tv_sec < agMinElapsed_.tv_sec |
| || (timediff.tv_sec == agMinElapsed_.tv_sec |
| && timediff.tv_nsec < agMinElapsed_.tv_nsec )) |
| // Have a new minimum time |
| agMinElapsed_ = timediff; |
| |
| for (int i=agBucketsSize_-1; i>=0; --i) |
| { |
| if (timediff.tv_sec > agBuckets_[i].tv_sec |
| || (timediff.tv_sec == agBuckets_[i].tv_sec |
| && timediff.tv_nsec > agBuckets_[i].tv_nsec )) |
| { |
| ++agElapsed_[i]; |
| if (i >= 7) |
| { |
| slowAg = true; |
| if (trace_settings & TRACE_SYNC) |
| { |
| trace_printf("%s@%d slow Allgather=(%ld, %ld) seqNum_=%lld, i=%d\n", |
| method_name, __LINE__, |
| timediff.tv_sec, timediff.tv_nsec, seqNum_, i); |
| } |
| } |
| break; |
| } |
| } |
| |
| return slowAg; |
| } |
| |
| // Display "Allgather" statistics |
| void CCluster::stats() |
| { |
| const char method_name[] = "CCluster::stats"; |
| |
| trace_printf("%s@%d Allgather min elapsed=%ld.%ld\n", method_name, __LINE__, |
| agMinElapsed_.tv_sec, agMinElapsed_.tv_nsec); |
| |
| trace_printf("%s@%d Allgather max elapsed=%ld.%ld\n", method_name, __LINE__, |
| agMaxElapsed_.tv_sec, agMaxElapsed_.tv_nsec); |
| |
| unsigned long int bucket; |
| const char * unit; |
| const char * range; |
| for (int i=0; i<agBucketsSize_; ++i) |
| { |
| if ( i == (agBucketsSize_-1)) |
| { |
| bucket = agBuckets_[i].tv_nsec; |
| range = ">"; |
| } |
| else |
| { |
| bucket = agBuckets_[i+1].tv_nsec; |
| range = "<="; |
| } |
| bucket = bucket/1000; |
| if (bucket < 1000) |
| unit = "usec"; |
| else |
| { |
| bucket = bucket / 1000; |
| if ( bucket < 1000 ) |
| unit = "msec"; |
| else |
| unit = "???"; |
| } |
| trace_printf("%s@%d bucket[%d]=%d (%s %ld %s)\n", |
| method_name, __LINE__, i, agElapsed_[i], |
| range, bucket, unit); |
| } |
| } |
| |
| bool CCluster::exchangeNodeData ( ) |
| { |
| const char method_name[] = "CCluster::exchangeNodeData"; |
| TRACE_ENTRY; |
| |
| bool result = false; |
| |
| // Record statistics (sonar counters) |
| if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED)) |
| MonStats->req_sync_Incr(); |
| |
| ++swpRecCount_; // recursive count for this function |
| |
| bool doShutdown = false; |
| bool lastAllgatherWithLastSyncBuffer = false; |
| |
| struct internal_msg_def *msg; |
| MPI_Status status[GetConfigPNodesMax()]; |
| int err; |
| struct sync_buffer_def *recv_buffer; |
| struct sync_buffer_def *send_buffer = Nodes->GetSyncBuffer(); |
| unsigned long long savedSeqNum = 0; |
| |
| // if we are here in a second recursive call that occurred while |
| // processing TMSync data, use the second receive buffer |
| // else, use the first one. |
| if (swpRecCount_ == 1) |
| { |
| recv_buffer = recvBuffer_; |
| } |
| else |
| { |
| // should not be here in more than one recursive call. |
| assert(swpRecCount_ == 2); |
| recv_buffer = recvBuffer2_; |
| } |
| |
| // Initialize sync buffer header including node state |
| msg = Nodes->InitSyncBuffer( send_buffer, seqNum_, upNodes_ ); |
| |
| // Fill sync buffer based on queue of replication requests |
| Replicator.FillSyncBuffer ( msg ); |
| |
| reconnected: |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - doing Allgather size=%d, swpRecCount_=%d, " |
| "message count=%d, message seq_num=%lld, " |
| "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, " |
| "highSeqNum_=%lld, reconnectSeqNum_=%lld\n" |
| , method_name, __LINE__ |
| , Nodes->GetSyncSize() |
| , swpRecCount_ |
| , send_buffer->msgInfo.msg_count |
| , send_buffer->nodeInfo.seq_num |
| , seqNum_ |
| , lastSeqNum_ |
| , lowSeqNum_ |
| , highSeqNum_ |
| , reconnectSeqNum_); |
| |
| struct timespec ts_ag_begin; |
| clock_gettime(CLOCK_REALTIME, &ts_ag_begin); |
| |
| |
| // Exchange info with other nodes |
| err = Allgather(Nodes->GetSyncSize(), send_buffer, (char *)recv_buffer, |
| 0 /*seqNum_*/, status ); |
| |
| struct timespec ts_ag_end; |
| clock_gettime(CLOCK_REALTIME, &ts_ag_end); |
| |
| if (err != MPI_SUCCESS && err != MPI_ERR_IN_STATUS) |
| { |
| if (trace_settings & TRACE_SYNC) |
| { |
| trace_printf("%s@%d - unexpected Allgather error=%s (%d)\n", |
| method_name, __LINE__, ErrorMsg(err), err); |
| } |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Unexpected MPI communications " |
| "error=%s (%d).\n", method_name, ErrorMsg(err), err); |
| mon_log_write(MON_CLUSTER_EXCHANGENODEDATA_1, SQ_LOG_ERR, buf); |
| |
| // Allgather() failed in a fundamental way, bring this node down |
| if ( !enqueuedDown_ ) |
| { |
| enqueuedDown_ = true; |
| ReqQueue.enqueueDownReq(MyPNID); |
| } |
| } |
| else |
| { |
| if (agTimeStats( ts_ag_begin, ts_ag_end)) |
| { // Slow cycle, print info |
| if ( trace_settings & TRACE_SYNC ) |
| { |
| trace_printf("%s@%d - slow Allgather info: sync size=%d, message count=%d, MyPNID=%d\n", |
| method_name, __LINE__, Nodes->GetSyncSize(), |
| send_buffer->msgInfo.msg_count, MyPNID); |
| struct sync_buffer_def *msgBuf; |
| int nr; |
| |
| for (int i = 0; i < GetConfigPNodesMax(); i++) |
| { |
| bool noComm; |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| noComm = (comms_[i] == MPI_COMM_NULL) ? true : false; |
| break; |
| case CommType_Sockets: |
| noComm = (socks_[i] == -1) ? true : false; |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| // Only process active nodes |
| if (noComm) continue; |
| |
| msgBuf = (struct sync_buffer_def *) |
| (((char *) recv_buffer) + i * CommBufSize); |
| |
| MPI_Get_count(&status[i], MPI_CHAR, &nr); |
| |
| trace_printf("%s@%d - slow Allgather info, pnid=%d: received bytes=%d, message count=%d, msg_offset=%d\n", |
| method_name, __LINE__, i, nr, |
| msgBuf->msgInfo.msg_count, |
| msgBuf->msgInfo.msg_offset); |
| } |
| } |
| } |
| |
| UpdateClusterState( doShutdown |
| , recv_buffer |
| , status |
| , send_buffer->nodeInfo.change_nid); |
| |
| if ( lastAllgatherWithLastSyncBuffer ) |
| { |
| seqNum_ = savedSeqNum; |
| lastAllgatherWithLastSyncBuffer = false; |
| send_buffer = Nodes->GetSyncBuffer(); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - Resetting lastAllgatherWithLastSyncBuffer=%d\n" |
| , method_name, __LINE__ |
| , lastAllgatherWithLastSyncBuffer); |
| |
| goto reconnected; |
| } |
| |
| if ( reconnectSeqNum_ != 0 ) |
| { |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - Allgather IO retry, swpRecCount_=%d, " |
| "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, " |
| "highSeqNum_=%lld, reconnectSeqNum_=%lld\n" |
| , method_name, __LINE__ |
| , swpRecCount_ |
| , seqNum_ |
| , lastSeqNum_ |
| , lowSeqNum_ |
| , highSeqNum_ |
| , reconnectSeqNum_); |
| |
| // The Allgather() has executed a reconnect at reconnectSeqNum_. |
| // The UpdateClusterState has set the lowSeqNum_and highSeqNum_ |
| // in the current IO exchange which will indicate whether there is |
| // a mismatch in IOs between monitor processes. If there is a mismatch, |
| // the lowSeqNum_and highSeqNum_ relative to our current seqNum_ |
| // will determine how to redrive the exchange of node data. |
| if (seqNum_ > lowSeqNum_) |
| { // A remote monitor did not receive our last SyncBuffer |
| // Redo exchange with the previous SyncBuffer |
| send_buffer = Nodes->GetLastSyncBuffer(); |
| savedSeqNum = seqNum_; |
| seqNum_ = lastSeqNum_; |
| // Indicate to follow up the next exchange with current SyncBuffer |
| lastAllgatherWithLastSyncBuffer = true; |
| lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - Setting lastAllgatherWithLastSyncBuffer=%d\n" |
| , method_name, __LINE__ |
| , lastAllgatherWithLastSyncBuffer); |
| |
| goto reconnected; |
| } |
| else if (seqNum_ < highSeqNum_) |
| { // The local monitor did not receive the last remote SyncBuffer |
| // Redo exchange with the current SyncBuffer |
| send_buffer = Nodes->GetSyncBuffer(); |
| lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - lastAllgatherWithLastSyncBuffer=%d\n" |
| , method_name, __LINE__ |
| , lastAllgatherWithLastSyncBuffer); |
| |
| goto reconnected; |
| } |
| lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0; |
| } |
| } |
| |
| if ( ProcessClusterData( recv_buffer, send_buffer, false ) ) |
| { // There is a TmSync message remaining to be handled |
| ProcessClusterData( recv_buffer, send_buffer, true ); |
| } |
| |
| if (swpRecCount_ == 1) |
| { |
| // Save the sync buffer and corresponding sequence number we just processed |
| // On reconnect we must resend the last buffer and the current buffer |
| // to ensure dropped buffers are processed by all monitor processe in the |
| // correct order |
| Nodes->SaveMyLastSyncBuffer(); |
| lastSeqNum_ = seqNum_; |
| |
| // Increment count of "Allgather" calls. If wrap-around, start again at 1. |
| if ( ++seqNum_ == 0) seqNum_ = 1; |
| } |
| |
| // Wake up any threads waiting on the completion of a sync cycle |
| syncCycle_.wakeAll(); |
| |
| if (doShutdown) result = checkIfDone( ); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - node data exchange completed, swpRecCount_=%d, " |
| "seqNum_=%lld, lastSeqNum_=%lld, reconnectSeqNum_=%lld\n" |
| , method_name, __LINE__ |
| , swpRecCount_ |
| , seqNum_ |
| , lastSeqNum_ |
| , reconnectSeqNum_); |
| |
| --swpRecCount_; |
| |
| TRACE_EXIT; |
| |
| return result; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| void CCluster::exchangeTmSyncData ( struct sync_def *sync, bool bumpSync ) |
| { |
| const char method_name[] = "CCluster::exchangeTmSyncData"; |
| TRACE_ENTRY; |
| |
| ++swpRecCount_; // recursive count for this function |
| |
| bool doShutdown = false; |
| bool lastAllgatherWithLastSyncBuffer = false; |
| |
| struct internal_msg_def *msg; |
| MPI_Status status[GetConfigPNodesMax()]; |
| int err; |
| struct sync_buffer_def *recv_buffer; |
| struct sync_buffer_def *send_buffer = Nodes->GetSyncBuffer(); |
| unsigned long long savedSeqNum = 0; |
| |
| // if we are here in a second recursive call that occurred while |
| // processing TMSync data, use the second receive buffer |
| // else, use the first one. |
| if (swpRecCount_ == 1) |
| { |
| recv_buffer = recvBuffer_; |
| } |
| else |
| { |
| // should not be here in more than one recursive call. |
| assert(swpRecCount_ == 2); |
| recv_buffer = recvBuffer2_; |
| } |
| |
| if (bumpSync) |
| { |
| // Save the sync buffer and corresponding sequence number we just processed |
| // On reconnect we must resend the last buffer and the current buffer |
| // to ensure dropped buffers are processed by all monitor processe in the |
| // correct order |
| Nodes->SaveMyLastSyncBuffer(); |
| lastSeqNum_ = seqNum_; |
| |
| // Increment count of "Allgather" calls. If wrap-around, start again at 1. |
| if ( ++seqNum_ == 0) seqNum_ = 1; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - Bumping sequence number, " |
| "swpRecCount_=%d, seqNum_=%lld, lastSeqNum_=%lld\n" |
| , method_name, __LINE__ |
| , swpRecCount_ |
| , seqNum_ |
| , lastSeqNum_); |
| |
| } |
| |
| // Initialize sync buffer header including node state |
| msg = Nodes->InitSyncBuffer( send_buffer, seqNum_, upNodes_ ); |
| |
| // Add tmsync data |
| AddTmsyncMsg( send_buffer, sync, msg ); |
| |
| reconnected: |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - doing Allgather size=%d, swpRecCount_=%d, " |
| "message count=%d, message seq_num=%lld, " |
| "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, " |
| "highSeqNum_=%lld, reconnectSeqNum_=%lld\n" |
| , method_name, __LINE__ |
| , Nodes->GetSyncSize() |
| , swpRecCount_ |
| , send_buffer->msgInfo.msg_count |
| , send_buffer->nodeInfo.seq_num |
| , seqNum_ |
| , lastSeqNum_ |
| , lowSeqNum_ |
| , highSeqNum_ |
| , reconnectSeqNum_); |
| |
| struct timespec ts_ag_begin; |
| clock_gettime(CLOCK_REALTIME, &ts_ag_begin); |
| |
| |
| // Exchange info with other nodes |
| err = Allgather(Nodes->GetSyncSize(), send_buffer, (char *)recv_buffer, |
| 0 /*seqNum_*/, status ); |
| |
| struct timespec ts_ag_end; |
| clock_gettime(CLOCK_REALTIME, &ts_ag_end); |
| |
| if (err != MPI_SUCCESS && err != MPI_ERR_IN_STATUS) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf("%s@%d - unexpected Allgather error=%s (%d)\n", |
| method_name, __LINE__, ErrorMsg(err), err); |
| } |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Unexpected MPI communications " |
| "error=%s (%d).\n", method_name, ErrorMsg(err), err); |
| mon_log_write(MON_CLUSTER_EXCHANGETMSYNC_1, SQ_LOG_ERR, buf); |
| |
| // Allgather() failed in a fundamental way, bring this node down |
| if ( !enqueuedDown_ ) |
| { |
| enqueuedDown_ = true; |
| ReqQueue.enqueueDownReq(MyPNID); |
| } |
| } |
| else |
| { |
| if (agTimeStats( ts_ag_begin, ts_ag_end)) |
| { // Slow cycle, print info |
| if ( trace_settings & TRACE_SYNC ) |
| { |
| trace_printf("%s@%d - slow Allgather info: sync size=%d, message count=%d, MyPNID=%d\n", |
| method_name, __LINE__, Nodes->GetSyncSize(), |
| send_buffer->msgInfo.msg_count, MyPNID); |
| struct sync_buffer_def *msgBuf; |
| int nr; |
| |
| for (int i = 0; i < GetConfigPNodesMax(); i++) |
| { |
| bool noComm; |
| switch( CommType ) |
| { |
| case CommType_InfiniBand: |
| noComm = (comms_[i] == MPI_COMM_NULL) ? true : false; |
| break; |
| case CommType_Sockets: |
| noComm = (socks_[i] == -1) ? true : false; |
| break; |
| default: |
| // Programmer bonehead! |
| abort(); |
| } |
| // Only process active nodes |
| if (noComm) continue; |
| |
| msgBuf = (struct sync_buffer_def *) |
| (((char *) recv_buffer) + i * CommBufSize); |
| |
| MPI_Get_count(&status[i], MPI_CHAR, &nr); |
| |
| trace_printf("%s@%d - slow Allgather info, pnid=%d: received bytes=%d, message count=%d, msg_offset=%d\n", |
| method_name, __LINE__, i, nr, |
| msgBuf->msgInfo.msg_count, |
| msgBuf->msgInfo.msg_offset); |
| } |
| } |
| } |
| |
| UpdateClusterState( doShutdown |
| , recv_buffer |
| , status |
| , send_buffer->nodeInfo.change_nid); |
| |
| if ( lastAllgatherWithLastSyncBuffer ) |
| { |
| seqNum_ = savedSeqNum; |
| lastAllgatherWithLastSyncBuffer = false; |
| send_buffer = Nodes->GetSyncBuffer(); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - Resetting lastAllgatherWithLastSyncBuffer=%d\n" |
| , method_name, __LINE__ |
| , lastAllgatherWithLastSyncBuffer); |
| |
| goto reconnected; |
| } |
| |
| if ( reconnectSeqNum_ != 0 ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - Allgather IO retry, swpRecCount_=%d, " |
| "seqNum_=%lld, lastSeqNum_=%lld, lowSeqNum_=%lld, " |
| "highSeqNum_=%lld, reconnectSeqNum_=%lld\n" |
| , method_name, __LINE__ |
| , swpRecCount_ |
| , seqNum_ |
| , lastSeqNum_ |
| , lowSeqNum_ |
| , highSeqNum_ |
| , reconnectSeqNum_); |
| |
| // The Allgather() has executed a reconnect at reconnectSeqNum_. |
| // The UpdateClusterState has set the lowSeqNum_and highSeqNum_ |
| // in the current IO exchange which will indicate whether there is |
| // a mismatch in IOs between monitor processes. If there is a mismatch, |
| // the lowSeqNum_and highSeqNum_ relative to our current seqNum_ |
| // will determine how to redrive the exchange of node data. |
| if (seqNum_ > lowSeqNum_) |
| { // A remote monitor did not receive our last SyncBuffer |
| // Redo exchange with the previous SyncBuffer |
| send_buffer = Nodes->GetLastSyncBuffer(); |
| savedSeqNum = seqNum_; |
| seqNum_ = lastSeqNum_; |
| // Indicate to follow up the next exchange with current SyncBuffer |
| lastAllgatherWithLastSyncBuffer = true; |
| lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - Setting lastAllgatherWithLastSyncBuffer=%d\n" |
| , method_name, __LINE__ |
| , lastAllgatherWithLastSyncBuffer); |
| |
| goto reconnected; |
| } |
| else if (seqNum_ < highSeqNum_) |
| { // The local monitor did not receive the last remote SyncBuffer |
| // Redo exchange with the current SyncBuffer |
| send_buffer = Nodes->GetSyncBuffer(); |
| lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - lastAllgatherWithLastSyncBuffer=%d\n" |
| , method_name, __LINE__ |
| , lastAllgatherWithLastSyncBuffer); |
| |
| goto reconnected; |
| } |
| lowSeqNum_ = highSeqNum_ = reconnectSeqNum_ = 0; |
| } |
| } |
| |
| if ( ProcessClusterData( recv_buffer, send_buffer, false ) ) |
| { // There is a TmSync message remaining to be handled |
| ProcessClusterData( recv_buffer, send_buffer, true ); |
| } |
| |
| if (swpRecCount_ == 1) |
| { |
| // Save the sync buffer and corresponding sequence number we just processed |
| // On reconnect we must resend the last buffer and the current buffer |
| // to ensure dropped buffers are processed by all monitor processe in the |
| // correct order |
| Nodes->SaveMyLastSyncBuffer(); |
| lastSeqNum_ = seqNum_; |
| |
| // Increment count of "Allgather" calls. If wrap-around, start again at 1. |
| if ( ++seqNum_ == 0) seqNum_ = 1; |
| } |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC)) |
| trace_printf( "%s@%d - node data exchange completed, swpRecCount_=%d, " |
| "seqNum_=%lld, lastSeqNum_=%lld, reconnectSeqNum_=%lld\n" |
| , method_name, __LINE__ |
| , swpRecCount_ |
| , seqNum_ |
| , lastSeqNum_ |
| , reconnectSeqNum_); |
| |
| --swpRecCount_; |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| void CCluster::EpollCtl( int efd, int op, int fd, struct epoll_event *event ) |
| { |
| const char method_name[] = "CCluster::EpollCtl"; |
| TRACE_ENTRY; |
| #if 0 |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| int iPeer; |
| for ( iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ ) |
| { // Find corresponding peer by matching socket fd |
| if ( fd == socks_[iPeer] ) break; |
| } |
| trace_printf( "%s@%d epoll_ctl( efd=%d,%s, fd=%d(%s), %s )\n" |
| , method_name, __LINE__ |
| , efd |
| , EpollOpString(op) |
| , fd, Node[iPeer]->GetName() |
| , EpollEventString(event->events) ); |
| } |
| #endif |
| int rc = epoll_ctl( efd, op, fd, event ); |
| if ( rc == -1 ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| int iPeer; |
| for ( iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ ) |
| { // Find corresponding peer by matching socket fd |
| if ( fd == socks_[iPeer] ) break; |
| } |
| snprintf( buf, sizeof(buf), "[%s@%d] epoll_ctl(efd=%d,%s, fd=%d(%s), %s) error: %s\n" |
| , method_name, __LINE__ |
| , efd |
| , EpollOpString(op) |
| , fd, Node[iPeer]->GetName() |
| , EpollEventString(event->events) |
| , strerror_r( errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_EPOLLCTL_1, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| |
| TRACE_EXIT; |
| return; |
| } |
| |
| void CCluster::EpollCtlDelete( int efd, int fd, struct epoll_event *event ) |
| { |
| const char method_name[] = "CCluster::EpollCtlDelete"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| int iPeer; |
| for ( iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ ) |
| { // Find corresponding peer by matching socket fd |
| if ( fd == socks_[iPeer] ) break; |
| } |
| trace_printf( "%s@%d epoll_ctl( efd=%d,%s, fd=%d(%s), %s )\n" |
| , method_name, __LINE__ |
| , efd |
| , EpollOpString(EPOLL_CTL_DEL) |
| , fd, Node[iPeer]->GetName() |
| , EpollEventString(event->events) ); |
| } |
| |
| // Remove old socket from epoll set, it may not be there |
| int rc = epoll_ctl( efd, EPOLL_CTL_DEL, fd, event ); |
| if ( rc == -1 ) |
| { |
| int err = errno; |
| if (err != ENOENT) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] epoll_ctl(efd=%d, %s, fd=%d, %s) error: %s\n" |
| , method_name, __LINE__ |
| , efd |
| , EpollOpString(EPOLL_CTL_DEL) |
| , fd |
| , EpollEventString(event->events) |
| , strerror_r( err, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_EPOLLCTLDELETE_1, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| } |
| |
| TRACE_EXIT; |
| return; |
| } |
| |
| void CCluster::InitClusterSocks( int worldSize, int myRank, char *nodeNames, int *rankToPnid ) |
| { |
| const char method_name[] = "CCluster::InitClusterSocks"; |
| TRACE_ENTRY; |
| |
| int serverSyncPort; |
| CNode *node; |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d worldSize=%d, myRank=%d\n" |
| , method_name, __LINE__ |
| , worldSize, myRank); |
| } |
| |
| // Exchange ports with collective |
| serverSyncPort = MyNode->GetSyncSocketPort(); |
| int rc = MPI_Allgather( &serverSyncPort, 1, MPI_INT, |
| sockPorts_, 1, MPI_INT, MPI_COMM_WORLD ); |
| if ( rc != MPI_SUCCESS ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] MPI_Allgather error=%s\n", |
| method_name, __LINE__, ErrorMsg( rc ) ); |
| mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_3, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| #ifdef NAMESERVER_PROCESS |
| if ( !IsRealCluster ) |
| { |
| for ( int i = 0; i < worldSize; i++ ) |
| sockPorts_[i] = syncPort_ + i; |
| } |
| #endif |
| |
| char *n, nodeName[MPI_MAX_PROCESSOR_NAME]; |
| unsigned char srcaddr[4], dstaddr[4]; |
| struct hostent *he; |
| if ( nodeNames ) |
| { |
| n = &nodeNames[myRank*MPI_MAX_PROCESSOR_NAME]; |
| } |
| else |
| { |
| strcpy( nodeName, "localhost" ); |
| n = nodeName; |
| } |
| // Get my host structure via my node name or localhost |
| he = gethostbyname( n ); |
| if ( !he ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n", |
| method_name, __LINE__, n, strerror_r( h_errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_4, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| // Initialize my source address structure |
| memcpy( srcaddr, he->h_addr, 4 ); |
| int idst; |
| // Loop on each node in the cluster |
| for ( int i = 0; i < worldSize; i++ ) |
| { |
| // Loop on each adjacent node in the cluster |
| for ( int j = i+1; j < worldSize; j++ ) |
| { |
| if ( i == myRank ) |
| { // Current [i] node is my node, so connect to [j] node |
| idst = j; |
| if ( nodeNames ) |
| { // Real cluster |
| n = &nodeNames[j*MPI_MAX_PROCESSOR_NAME]; |
| // Get peer's host structure via its node name |
| he = gethostbyname( n ); |
| if ( !he ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), |
| "[%s@%d] gethostbyname(%s) error: %s\n", |
| method_name, __LINE__, n, |
| strerror_r( h_errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_5, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| // Initialize peer's destination address structure |
| memcpy( dstaddr, he->h_addr, 4 ); |
| node = Nodes->GetNode( n ); |
| if ( node ) |
| { // Save peer's port in its node object |
| node->SetSyncSocketPort(sockPorts_[j]); |
| } |
| } |
| else |
| { // Virtual cluster. Same source and destination addresses |
| node = NULL; |
| memcpy( dstaddr, srcaddr, 4 ); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d Creating client socket: src=%d.%d.%d.%d, dst(%s)=%d.%d.%d.%d, dst port=%d\n" |
| , method_name, __LINE__ |
| , (int)((unsigned char *)srcaddr)[0] |
| , (int)((unsigned char *)srcaddr)[1] |
| , (int)((unsigned char *)srcaddr)[2] |
| , (int)((unsigned char *)srcaddr)[3] |
| , n |
| , (int)((unsigned char *)dstaddr)[0] |
| , (int)((unsigned char *)dstaddr)[1] |
| , (int)((unsigned char *)dstaddr)[2] |
| , (int)((unsigned char *)dstaddr)[3] |
| , sockPorts_[j] ); |
| } |
| // Connect to peer |
| socks_[rankToPnid[j]] = MkCltSock( srcaddr, dstaddr, sockPorts_[j] ); // InitClusterSocks |
| } |
| else if ( j == myRank ) |
| { // Current [j] peer my node, accept connection from peer [i] node |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d Accepting server socket: src=%d.%d.%d.%d, port=%d\n" |
| , method_name, __LINE__ |
| , (int)((unsigned char *)srcaddr)[0] |
| , (int)((unsigned char *)srcaddr)[1] |
| , (int)((unsigned char *)srcaddr)[2] |
| , (int)((unsigned char *)srcaddr)[3] |
| , serverSyncPort ); |
| } |
| |
| idst = i; |
| // Accept connection from peer [i] |
| socks_[rankToPnid[i]] = AcceptSock( syncSock_ ); // InitClusterSocks |
| } |
| else |
| { |
| idst = -1; |
| } |
| if ( idst >= 0 && socks_[rankToPnid[idst]] < 0 ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| if ( idst == i ) |
| { |
| snprintf( buf, sizeof(buf), "[%s@%d] mkcltsock src=%d.%d.%d.%d dst=%d.%d.%d.%d failed\n", |
| method_name, __LINE__, |
| (int)((unsigned char *)srcaddr)[0], |
| (int)((unsigned char *)srcaddr)[1], |
| (int)((unsigned char *)srcaddr)[2], |
| (int)((unsigned char *)srcaddr)[3], |
| (int)((unsigned char *)dstaddr)[0], |
| (int)((unsigned char *)dstaddr)[1], |
| (int)((unsigned char *)dstaddr)[2], |
| (int)((unsigned char *)dstaddr)[3] ); |
| } |
| else |
| { |
| snprintf( buf, sizeof(buf), "[%s@%d] acceptsock(%d) failed\n", |
| method_name, __LINE__, syncSock_ ); |
| } |
| mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_6, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| if ( idst >= 0 && fcntl( socks_[rankToPnid[idst]], F_SETFL, O_NONBLOCK ) ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] fcntl(NONBLOCK) error: %s\n", |
| method_name, __LINE__, strerror_r( errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_INITCLUSTERSOCKS_7, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| MPI_Barrier( MPI_COMM_WORLD ); |
| } |
| } |
| TRACE_EXIT; |
| } |
| |
| void CCluster::InitServerSock( void ) |
| { |
| const char method_name[] = "CCluster::InitServerSock"; |
| TRACE_ENTRY; |
| int serverCommPort = 0; |
| int serverSyncPort = 0; |
| #ifdef NAMESERVER_PROCESS |
| int mon2nsPort = 0; |
| #else |
| int ptpPort = 0; |
| #endif |
| int val = 0; |
| |
| unsigned char addr[4]; |
| struct hostent *he; |
| |
| he = gethostbyname( Node_name ); |
| if ( !he ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] gethostbyname(%s) error: %s\n" |
| , method_name, __LINE__ |
| , Node_name, strerror_r( h_errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_INITSERVERSOCK_1, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| memcpy( addr, he->h_addr, 4 ); |
| |
| #ifdef NAMESERVER_PROCESS |
| char *env = getenv ("NS_COMM_PORT"); |
| #else |
| char *env = getenv("MONITOR_COMM_PORT"); |
| #endif |
| if ( env ) |
| { |
| val = atoi(env); |
| if ( val > 0) |
| { |
| if ( !IsRealCluster ) |
| { |
| val += MyPNID; |
| } |
| serverCommPort = val; |
| } |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d COMM_PORT Node_name=%s, env=%s, serverCommPort=%d, val=%d\n" |
| , method_name, __LINE__ |
| , Node_name, env, serverCommPort, val ); |
| } |
| |
| commSock_ = MkSrvSock( &serverCommPort ); |
| if ( commSock_ < 0 ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| #ifdef NAMESERVER_PROCESS |
| , "[%s@%d] MkSrvSock(NS_COMM_PORT=%d) error: %s\n" |
| #else |
| , "[%s@%d] MkSrvSock(MONITOR_COMM_PORT=%d) error: %s\n" |
| #endif |
| , method_name, __LINE__, serverCommPort |
| , strerror_r( errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_INITSERVERSOCK_2, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| else |
| { |
| snprintf( MyCommPort, sizeof(MyCommPort) |
| , "%d.%d.%d.%d:%d" |
| , (int)((unsigned char *)addr)[0] |
| , (int)((unsigned char *)addr)[1] |
| , (int)((unsigned char *)addr)[2] |
| , (int)((unsigned char *)addr)[3] |
| , serverCommPort ); |
| MyNode->SetCommSocketPort( serverCommPort ); |
| MyNode->SetCommPort( MyCommPort ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d Initialized my comm socket port, " |
| "pnid=%d (%s:%s) (commPort=%s)\n" |
| , method_name, __LINE__ |
| , MyPNID, MyNode->GetName(), MyCommPort |
| , MyNode->GetCommPort() ); |
| |
| } |
| |
| #ifdef NAMESERVER_PROCESS |
| env = getenv("NS_SYNC_PORT"); |
| #else |
| env = getenv("MONITOR_SYNC_PORT"); |
| #endif |
| if ( env ) |
| { |
| val = atoi(env); |
| if ( val > 0) |
| { |
| if ( !IsRealCluster ) |
| { |
| val += MyPNID; |
| } |
| syncPort_ = serverSyncPort = val; |
| } |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d SYNC_PORT Node_name=%s, env=%s, serverSyncPort=%d, val=%d\n" |
| , method_name, __LINE__ |
| , Node_name, env, syncPort_, val ); |
| } |
| |
| syncSock_ = MkSrvSock( &serverSyncPort ); |
| if ( syncSock_ < 0 ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| #ifdef NAMESERVER_PROCESS |
| , "[%s@%d] MkSrvSock(NS_SYNC_PORT=%d) error: %s\n" |
| #else |
| , "[%s@%d] MkSrvSock(MONITOR_SYNC_PORT=%d) error: %s\n" |
| #endif |
| , method_name, __LINE__, serverSyncPort |
| , strerror_r( errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_INITSERVERSOCK_3, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| else |
| { |
| snprintf( MySyncPort, sizeof(MySyncPort) |
| , "%d.%d.%d.%d:%d" |
| , (int)((unsigned char *)addr)[0] |
| , (int)((unsigned char *)addr)[1] |
| , (int)((unsigned char *)addr)[2] |
| , (int)((unsigned char *)addr)[3] |
| , serverSyncPort ); |
| MyNode->SetSyncSocketPort( serverSyncPort ); |
| MyNode->SetSyncPort( MySyncPort ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d Initialized my sync socket port, " |
| "pnid=%d (%s:%s) (syncPort=%s)\n" |
| , method_name, __LINE__ |
| , MyPNID, MyNode->GetName(), MySyncPort |
| , MyNode->GetSyncPort() ); |
| } |
| |
| #ifdef NAMESERVER_PROCESS |
| env = getenv("NS_M2N_COMM_PORT"); |
| if ( env ) |
| { |
| val = atoi(env); |
| if ( val > 0) |
| { |
| if ( !IsRealCluster ) |
| { |
| val += MyPNID; |
| } |
| mon2nsPort = val; |
| } |
| } |
| |
| mon2nsSock_ = MkSrvSock( &mon2nsPort ); |
| if ( mon2nsSock_ < 0 ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] MkSrvSock(NS_M2N_COMM_PORT=%d) error: %s\n" |
| , method_name, __LINE__, mon2nsPort |
| , strerror_r( errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_INITSERVERSOCK_4, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| else |
| { |
| snprintf( MyMon2NsPort, sizeof(MyMon2NsPort) |
| , "%d.%d.%d.%d:%d" |
| , (int)((unsigned char *)addr)[0] |
| , (int)((unsigned char *)addr)[1] |
| , (int)((unsigned char *)addr)[2] |
| , (int)((unsigned char *)addr)[3] |
| , mon2nsPort ); |
| MyNode->SetMon2NsPort( MyMon2NsPort ); |
| MyNode->SetMon2NsSocketPort( mon2nsPort ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d Initialized my mon2ns comm socket port, " |
| "pnid=%d (%s:%s) (Mon2NsPort=%s, Mon2NsSocketPort=%d)\n" |
| , method_name, __LINE__ |
| , MyPNID, MyNode->GetName(), MyMon2NsPort |
| , MyNode->GetMon2NsPort() |
| , MyNode->GetMon2NsSocketPort() ); |
| |
| } |
| #else |
| if (NameServerEnabled) |
| { |
| env = getenv("MON2MON_COMM_PORT"); |
| if ( env ) |
| { |
| val = atoi(env); |
| if ( val > 0) |
| { |
| ptpPort = val; |
| } |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] MON2MON_COMM_PORT environment variable is not set!\n" |
| , method_name, __LINE__ ); |
| mon_log_write( MON_CLUSTER_INITSERVERSOCK_5, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| |
| // For virtual env, add PNid to the port so we can still test without collisions of port numbers |
| if (!IsRealCluster) |
| { |
| ptpPort += MyNode->GetPNid(); |
| } |
| |
| ptpSock_ = MkSrvSock( &ptpPort ); |
| if ( ptpSock_ < 0 ) |
| { |
| char ebuff[MON_STRING_BUF_SIZE]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s@%d] MkSrvSock(MON2MON_COMM_PORT=%d) error: %s\n" |
| , method_name, __LINE__, ptpPort |
| , strerror_r( errno, ebuff, MON_STRING_BUF_SIZE ) ); |
| mon_log_write( MON_CLUSTER_INITSERVERSOCK_6, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| else |
| { |
| snprintf( MyPtPPort, sizeof(MyPtPPort) |
| , "%d.%d.%d.%d:%d" |
| , (int)((unsigned char *)addr)[0] |
| , (int)((unsigned char *)addr)[1] |
| , (int)((unsigned char *)addr)[2] |
| , (int)((unsigned char *)addr)[3] |
| , ptpPort ); |
| MyNode->SetPtPPort( MyPtPPort ); |
| MyNode->SetPtPSocketPort( ptpPort ); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf( "%s@%d Initialized my ptp socket port, " |
| "pnid=%d (%s:%s) (ptpPort=%s)\n" |
| , method_name, __LINE__ |
| , MyPNID, MyNode->GetName(), MyPtPPort |
| , MyNode->GetPtPPort() ); |
| |
| } |
| } |
| #endif |
| |
| epollFD_ = epoll_create1( EPOLL_CLOEXEC ); |
| if ( epollFD_ < 0 ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] epoll_create1() error: %s\n", |
| method_name, __LINE__, strerror_r( errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_INITSERVERSOCK_7, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| int CCluster::AcceptCommSock( void ) |
| { |
| const char method_name[] = "CCluster::AcceptCommSock"; |
| TRACE_ENTRY; |
| |
| int csock = AcceptSock( commSock_ ); |
| |
| TRACE_EXIT; |
| return( csock ); |
| } |
| |
| int CCluster::AcceptSyncSock( void ) |
| { |
| const char method_name[] = "CCluster::AcceptSyncSock"; |
| TRACE_ENTRY; |
| |
| int csock = AcceptSock( syncSock_ ); |
| |
| TRACE_EXIT; |
| return( csock ); |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| int CCluster::AcceptPtPSock( void ) |
| { |
| const char method_name[] = "CCluster::AcceptPtPSock"; |
| TRACE_ENTRY; |
| |
| int csock = AcceptSock( ptpSock_ ); |
| |
| TRACE_EXIT; |
| return( csock ); |
| } |
| #endif |
| |
| |
| int CCluster::AcceptSock( int sock ) |
| { |
| const char method_name[] = "CCluster::AcceptSock"; |
| TRACE_ENTRY; |
| |
| #if defined(_XOPEN_SOURCE_EXTENDED) |
| #ifdef __LP64__ |
| socklen_t size; // size of socket address |
| #else |
| size_t size; // size of socket address |
| #endif |
| #else |
| int size; // size of socket address |
| #endif |
| int csock; // connected socket |
| struct sockaddr_in sockinfo; // socket address info |
| |
| size = sizeof(struct sockaddr *); |
| if ( getsockname( sock, (struct sockaddr *) &sockinfo, &size ) ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| snprintf(buf, sizeof(buf), "[%s], getsockname() failed, errno=%d (%s).\n", |
| method_name, err, strerror(err)); |
| mon_log_write(MON_CLUSTER_ACCEPTSOCK_1, SQ_LOG_ERR, buf); |
| return ( -1 ); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| unsigned char *addrp = (unsigned char *) &sockinfo.sin_addr.s_addr; |
| trace_printf( "%s@%d - Accepting socket on addr=%d.%d.%d.%d, port=%d\n" |
| , method_name, __LINE__ |
| , addrp[0] |
| , addrp[1] |
| , addrp[2] |
| , addrp[3] |
| , (int) ntohs( sockinfo.sin_port ) ); |
| } |
| |
| while ( ((csock = accept( sock |
| , (struct sockaddr *) 0 |
| , (socklen_t *) 0 ) ) < 0) && (errno == EINTR) ); |
| |
| if ( csock > 0 ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| unsigned char *addrp = (unsigned char *) &sockinfo.sin_addr.s_addr; |
| trace_printf( "%s@%d - Accepted socket on addr=%d.%d.%d.%d, port=%d, sock=%d\n" |
| , method_name, __LINE__ |
| , addrp[0] |
| , addrp[1] |
| , addrp[2] |
| , addrp[3] |
| , (int) ntohs( sockinfo.sin_port ) |
| , csock ); |
| } |
| |
| int nodelay = 1; |
| if ( setsockopt( csock |
| , IPPROTO_TCP |
| , TCP_NODELAY |
| , (char *) &nodelay |
| , sizeof(int) ) ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| snprintf(buf, sizeof(buf), "[%s], setsockopt() failed, errno=%d (%s).\n", |
| method_name, err, strerror(err)); |
| mon_log_write(MON_CLUSTER_ACCEPTSOCK_2, SQ_LOG_ERR, buf); |
| return ( -2 ); |
| } |
| |
| int reuse = 1; |
| if ( setsockopt( csock |
| , SOL_SOCKET |
| , SO_REUSEADDR |
| , (char *) &reuse |
| , sizeof(int) ) ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| snprintf(buf, sizeof(buf), "[%s], setsockopt() failed, errno=%d (%s).\n", |
| method_name, err, strerror(err)); |
| mon_log_write(MON_CLUSTER_ACCEPTSOCK_3, SQ_LOG_ERR, buf); |
| return ( -2 ); |
| } |
| } |
| |
| TRACE_EXIT; |
| return ( csock ); |
| } |
| |
| int CCluster::Connect( const char *portName, bool doRetries ) |
| { |
| const char method_name[] = "CCluster::Connect"; |
| TRACE_ENTRY; |
| |
| int sock; // socket |
| int ret; // returned value |
| int nodelay = 1; // sockopt reuse option |
| int reuse = 1; // sockopt reuse option |
| #if defined(_XOPEN_SOURCE_EXTENDED) |
| #ifdef __LP64__ |
| socklen_t size; // size of socket address |
| #else |
| size_t size; // size of socket address |
| #endif |
| #else |
| int size; // size of socket address |
| #endif |
| static int retries = 0; // # times to retry connect |
| int outer_failures = 0; // # failed connect loops |
| int connect_failures = 0; // # failed connects |
| char *p; // getenv results |
| struct sockaddr_in sockinfo; // socket address info |
| struct hostent *he; |
| char host[1000]; |
| const char *colon; |
| unsigned int port; |
| |
| colon = strstr(portName, ":"); |
| strcpy(host, portName); |
| int len = colon - portName; |
| host[len] = '\0'; |
| port = atoi(&colon[1]); |
| size = sizeof(sockinfo); |
| |
| if ( !retries ) |
| { |
| p = getenv( "HPMP_CONNECT_RETRIES" ); |
| if ( p ) retries = atoi( p ); |
| else retries = 5; |
| } |
| |
| for ( ;; ) |
| { |
| sock = socket( AF_INET, SOCK_STREAM, 0 ); |
| if ( sock < 0 ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], socket() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_CONNECT_1, SQ_LOG_CRIT, la_buf); |
| abort(); |
| } |
| |
| he = gethostbyname( host ); |
| if ( !he ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n", |
| method_name, __LINE__, host, strerror_r( h_errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_CONNECT_2, SQ_LOG_CRIT, buf ); |
| abort(); |
| } |
| |
| // Connect socket. |
| memset( (char *) &sockinfo, 0, size ); |
| memcpy( (char *) &sockinfo.sin_addr, (char *) he->h_addr, 4 ); |
| sockinfo.sin_family = AF_INET; |
| sockinfo.sin_port = htons( (unsigned short) port ); |
| |
| // Note the outer loop uses "retries" from HPMP_CONNECT_RETRIES, |
| // and has a yield between each retry, since it's more oriented |
| // toward failures from network overload and putting a pause |
| // between retries. This inner loop should only iterate when |
| // a signal interrupts the local process, so it doesn't pause |
| // or use the same HPMP_CONNECT_RETRIES count. |
| connect_failures = 0; |
| ret = 1; |
| while ( ret != 0 && connect_failures <= 10 ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| if (doRetries) |
| { |
| trace_printf( "%s@%d - Connecting to %s, addr=%d.%d.%d.%d, port=%d, connect_failures=%d\n" |
| , method_name, __LINE__ |
| , portName |
| , (int)((unsigned char *)he->h_addr)[0] |
| , (int)((unsigned char *)he->h_addr)[1] |
| , (int)((unsigned char *)he->h_addr)[2] |
| , (int)((unsigned char *)he->h_addr)[3] |
| , port |
| , connect_failures ); |
| } |
| else |
| { |
| trace_printf( "%s@%d - Connecting to %s, addr=%d.%d.%d.%d, port=%d\n" |
| , method_name, __LINE__ |
| , portName |
| , (int)((unsigned char *)he->h_addr)[0] |
| , (int)((unsigned char *)he->h_addr)[1] |
| , (int)((unsigned char *)he->h_addr)[2] |
| , (int)((unsigned char *)he->h_addr)[3] |
| , port ); |
| } |
| } |
| |
| ret = connect( sock, (struct sockaddr *) &sockinfo, size ); |
| if ( ret == 0 ) break; |
| if ( errno == EINTR ) |
| { |
| ++connect_failures; |
| } |
| #ifdef NAMESERVER_PROCESS |
| else if ( errno == ECONNREFUSED ) |
| { |
| ++connect_failures; |
| sleep( 1 ); |
| } |
| #endif |
| else |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], connect(%s) failed! errno=%d (%s)\n" |
| , method_name, portName, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_CONNECT_3, SQ_LOG_ERR, la_buf); |
| close(sock); |
| return ( -1 ); |
| } |
| } |
| |
| if ( ret == 0 ) break; |
| |
| if (doRetries == false) |
| { |
| close( sock ); |
| return( -1 ); |
| } |
| |
| // For large clusters, the connect/accept calls seem to fail occasionally, |
| // no doubt do to the large number (1000's) of simultaneous connect packets |
| // flooding the network at once. So, we retry up to HPMP_CONNECT_RETRIES |
| // number of times. |
| if ( errno != EINTR ) |
| { |
| if ( ++outer_failures > retries ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| sprintf( la_buf, "[%s], connect(%s) exceeded retries! count=%d\n" |
| , method_name, portName, retries); |
| mon_log_write(MON_CLUSTER_CONNECT_4, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| struct timespec req, rem; |
| req.tv_sec = 0; |
| req.tv_nsec = 500000; |
| nanosleep( &req, &rem ); |
| } |
| close( sock ); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Connected to %s addr=%d.%d.%d.%d, port=%d, sock=%d\n" |
| , method_name, __LINE__ |
| , host |
| , (int)((unsigned char *)he->h_addr)[0] |
| , (int)((unsigned char *)he->h_addr)[1] |
| , (int)((unsigned char *)he->h_addr)[2] |
| , (int)((unsigned char *)he->h_addr)[3] |
| , port |
| , sock ); |
| } |
| |
| if ( setsockopt( sock, IPPROTO_TCP, TCP_NODELAY, (char *) &nodelay, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_CONNECT_5, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_CONNECT_6, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| TRACE_EXIT; |
| return ( sock ); |
| } |
| |
| #ifdef NAMESERVER_PROCESS |
| void CCluster::ConnectToMon2NsCommSelf( void ) |
| { |
| const char method_name[] = "CCluster::ConnectToMon2NsCommSelf"; |
| TRACE_ENTRY; |
| |
| Connect( MyNode->GetMon2NsSocketPort() ); |
| |
| TRACE_EXIT; |
| } |
| #else |
| void CCluster::ConnectToPtPCommSelf( void ) |
| { |
| const char method_name[] = "CCluster::ConnectToPtPCommSelf"; |
| TRACE_ENTRY; |
| |
| Connect( MyNode->GetPtPSocketPort() ); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| void CCluster::ConnectToSelf( void ) |
| { |
| const char method_name[] = "CCluster::ConnectToSelf"; |
| TRACE_ENTRY; |
| |
| Connect( MyNode->GetCommSocketPort() ); |
| |
| TRACE_EXIT; |
| } |
| |
| void CCluster::Connect( int socketPort ) |
| { |
| const char method_name[] = "CCluster::Connect"; |
| TRACE_ENTRY; |
| |
| int sock; // socket |
| int ret; // returned value |
| #if defined(_XOPEN_SOURCE_EXTENDED) |
| #ifdef __LP64__ |
| socklen_t size; // size of socket address |
| #else |
| size_t size; // size of socket address |
| #endif |
| #else |
| int size; // size of socket address |
| #endif |
| static int retries = 0; // # times to retry connect |
| int connect_failures = 0; // # failed connects |
| char *p; // getenv results |
| struct sockaddr_in sockinfo; // socket address info |
| struct hostent *he; |
| |
| size = sizeof(sockinfo); |
| |
| if ( !retries ) |
| { |
| p = getenv( "HPMP_CONNECT_RETRIES" ); |
| if ( p ) retries = atoi( p ); |
| else retries = 5; |
| } |
| |
| sock = socket( AF_INET, SOCK_STREAM, 0 ); |
| if ( sock < 0 ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], socket() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_CONNECTTOSELF_1, SQ_LOG_CRIT, la_buf); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| |
| he = gethostbyname( "localhost" ); |
| if ( !he ) |
| { |
| char ebuff[256]; |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf), "[%s@%d] gethostbyname(%s) error: %s\n", |
| method_name, __LINE__, "localhost", strerror_r( h_errno, ebuff, 256 ) ); |
| mon_log_write( MON_CLUSTER_CONNECTTOSELF_2, SQ_LOG_CRIT, buf ); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| |
| // Connect socket. |
| memset( (char *) &sockinfo, 0, size ); |
| memcpy( (char *) &sockinfo.sin_addr, (char *) he->h_addr, 4 ); |
| sockinfo.sin_family = AF_INET; |
| sockinfo.sin_port = htons( (unsigned short) socketPort ); |
| |
| connect_failures = 0; |
| ret = 1; |
| while ( ret != 0 && connect_failures <= 10 ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Connecting to localhost addr=%d.%d.%d.%d, port=%d, connect_failures=%d\n" |
| , method_name, __LINE__ |
| , (int)((unsigned char *)he->h_addr)[0] |
| , (int)((unsigned char *)he->h_addr)[1] |
| , (int)((unsigned char *)he->h_addr)[2] |
| , (int)((unsigned char *)he->h_addr)[3] |
| , socketPort |
| , connect_failures ); |
| } |
| |
| ret = connect( sock, (struct sockaddr *) &sockinfo, size ); |
| if ( ret == 0 ) break; |
| if ( errno == EINTR ) |
| { |
| ++connect_failures; |
| } |
| else |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], connect() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_CONNECTTOSELF_3, SQ_LOG_CRIT, la_buf); |
| MPI_Abort( MPI_COMM_SELF,99 ); |
| } |
| } |
| |
| close( sock ); |
| |
| TRACE_EXIT; |
| } |
| |
| int CCluster::MkSrvSock( int *pport ) |
| { |
| const char method_name[] = "CCluster::MkSrvSock"; |
| TRACE_ENTRY; |
| |
| int sock; // socket |
| int err; // return code |
| #if defined(_XOPEN_SOURCE_EXTENDED) |
| #ifdef __LP64__ |
| socklen_t size; // size of socket address |
| #else |
| size_t size; // size of socket address |
| #endif |
| #else |
| unsigned int size; // size of socket address |
| #endif |
| struct sockaddr_in sockinfo; // socket address info |
| sock = socket( AF_INET, SOCK_STREAM, 0 ); |
| if ( sock < 0 ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], socket() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKSRVSOCK_1, SQ_LOG_CRIT, la_buf); |
| return ( -1 ); |
| } |
| |
| int nodelay = 1; // sockopt nodelay option |
| if ( setsockopt( sock, IPPROTO_TCP, TCP_NODELAY, (char *) &nodelay, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKSRVSOCK_2, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| int reuse = 1; // sockopt reuse option |
| if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt(SO_REUSEADDR) failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKSRVSOCK_3, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| |
| // Bind socket. |
| size = sizeof(sockinfo); |
| memset( (char *) &sockinfo, 0, size ); |
| sockinfo.sin_family = AF_INET; |
| sockinfo.sin_addr.s_addr = htonl( INADDR_ANY ); |
| sockinfo.sin_port = htons( *pport ); |
| int lv_bind_tries = 0; |
| do |
| { |
| if (lv_bind_tries > 0) |
| { |
| sleep(5); |
| } |
| err = bind( sock, (struct sockaddr *) &sockinfo, size ); |
| sched_yield( ); |
| } while ( err && |
| (errno == EADDRINUSE) && |
| (++lv_bind_tries < 4) ); |
| if ( err ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], bind() failed! port=%d, errno=%d (%s)\n" |
| , method_name, *pport, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKSRVSOCK_4, SQ_LOG_CRIT, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| if ( pport ) |
| { |
| if ( getsockname( sock, (struct sockaddr *) &sockinfo, &size ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], getsockname() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKSRVSOCK_5, SQ_LOG_CRIT, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| |
| *pport = (int) ntohs( sockinfo.sin_port ); |
| } |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| unsigned char *addrp = (unsigned char *) &sockinfo.sin_addr.s_addr; |
| trace_printf( "%s@%d listening on addr=%d.%d.%d.%d, port=%d\n" |
| , method_name, __LINE__ |
| , addrp[0] |
| , addrp[1] |
| , addrp[2] |
| , addrp[3] |
| , pport?*pport:0); |
| } |
| |
| int lv_retcode = SetKeepAliveSockOpt( sock ); |
| if ( lv_retcode != 0 ) |
| { |
| return lv_retcode; |
| } |
| |
| // Listen |
| if ( listen( sock, SOMAXCONN ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], listen() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKSRVSOCK_6, SQ_LOG_CRIT, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| TRACE_EXIT; |
| return ( sock ); |
| } |
| |
| int CCluster::MkCltSock( const char *portName ) |
| { |
| const char method_name[] = "CCluster::MkCltSock1"; |
| TRACE_ENTRY; |
| |
| int sock; // socket |
| int ret; // returned value |
| int reuse = 1; // sockopt reuse option |
| int nodelay = 1; // sockopt nodelay option |
| #if defined(_XOPEN_SOURCE_EXTENDED) |
| #ifdef __LP64__ |
| socklen_t size; // size of socket address |
| #else |
| size_t size; // size of socket address |
| #endif |
| #else |
| int size; // size of socket address |
| #endif |
| static int retries = 0; // # times to retry connect |
| int outer_failures = 0; // # failed connect loops |
| int connect_failures = 0; // # failed connects |
| char *p; // getenv results |
| struct sockaddr_in sockinfo; // socket address info |
| struct hostent *he; |
| char host[1000]; |
| const char *colon; |
| unsigned int port; |
| |
| colon = strstr(portName, ":"); |
| strcpy(host, portName); |
| int len = colon - portName; |
| host[len] = '\0'; |
| port = atoi(&colon[1]); |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Connecting to %s:%d\n" |
| , method_name, __LINE__ |
| , host |
| , port ); |
| } |
| } |
| |
| size = sizeof(sockinfo); |
| |
| if ( !retries ) |
| { |
| p = getenv( "HPMP_CONNECT_RETRIES" ); |
| if ( p ) retries = atoi( p ); |
| else retries = 5; |
| } |
| |
| for ( ;; ) |
| { |
| sock = socket( AF_INET, SOCK_STREAM, 0 ); |
| if ( sock < 0 ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s], socket() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_1, SQ_LOG_ERR, la_buf); |
| return ( -1 ); |
| } |
| |
| he = gethostbyname( host ); |
| if ( !he ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = h_errno; |
| snprintf( la_buf, sizeof(la_buf), |
| "[%s] gethostbyname(%s) failed! errno=%d (%s)\n" |
| , method_name, host, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_2, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| |
| // Connect socket. |
| memset( (char *) &sockinfo, 0, size ); |
| memcpy( (char *) &sockinfo.sin_addr, (char *) he->h_addr, 4 ); |
| sockinfo.sin_family = AF_INET; |
| sockinfo.sin_port = htons( (unsigned short) port ); |
| |
| // Note the outer loop uses "retries" from HPMP_CONNECT_RETRIES, |
| // and has a yield between each retry, since it's more oriented |
| // toward failures from network overload and putting a pause |
| // between retries. This inner loop should only iterate when |
| // a signal interrupts the local process, so it doesn't pause |
| // or use the same HPMP_CONNECT_RETRIES count. |
| connect_failures = 0; |
| ret = 1; |
| while ( ret != 0 && connect_failures <= 10 ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Connecting to %s addr=%d.%d.%d.%d, port=%d, connect_failures=%d\n" |
| , method_name, __LINE__ |
| , host |
| , (int)((unsigned char *)he->h_addr)[0] |
| , (int)((unsigned char *)he->h_addr)[1] |
| , (int)((unsigned char *)he->h_addr)[2] |
| , (int)((unsigned char *)he->h_addr)[3] |
| , port |
| , connect_failures ); |
| } |
| |
| ret = connect( sock, (struct sockaddr *) &sockinfo, size ); |
| if ( ret == 0 ) break; |
| if ( errno == EINTR ) |
| { |
| ++connect_failures; |
| } |
| else |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], connect() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_3, SQ_LOG_ERR, la_buf); |
| close(sock); |
| return ( -1 ); |
| } |
| } |
| |
| if ( ret == 0 ) break; |
| |
| // For large clusters, the connect/accept calls seem to fail occasionally, |
| // no doubt do to the large number (1000's) of simultaneous connect packets |
| // flooding the network at once. So, we retry up to HPMP_CONNECT_RETRIES |
| // number of times. |
| if ( errno != EINTR ) |
| { |
| if ( ++outer_failures > retries ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| sprintf( la_buf, "[%s], connect() exceeded retries! count=%d\n" |
| , method_name, retries); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_4, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| struct timespec req, rem; |
| req.tv_sec = 0; |
| req.tv_nsec = 500000; |
| nanosleep( &req, &rem ); |
| } |
| close( sock ); |
| } |
| |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Connected to %s addr=%d.%d.%d.%d, port=%d, sock=%d\n" |
| , method_name, __LINE__ |
| , host |
| , (int)((unsigned char *)he->h_addr)[0] |
| , (int)((unsigned char *)he->h_addr)[1] |
| , (int)((unsigned char *)he->h_addr)[2] |
| , (int)((unsigned char *)he->h_addr)[3] |
| , port |
| , sock ); |
| } |
| |
| if ( setsockopt( sock, IPPROTO_TCP, TCP_NODELAY, (char *) &nodelay, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_5, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_6, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| TRACE_EXIT; |
| return ( sock ); |
| } |
| |
| int CCluster::SetKeepAliveSockOpt( int sock ) |
| { |
| const char method_name[] = "CCluster::SetKeepAliveSockOpt"; |
| TRACE_ENTRY; |
| |
| static int sv_keepalive = -1; |
| static int sv_keepidle = 120; |
| static int sv_keepintvl = 12; |
| static int sv_keepcnt = 5; |
| |
| if ( sv_keepalive == -1 ) |
| { |
| char *lv_keepalive_env = getenv( "SQ_MON_KEEPALIVE" ); |
| if ( lv_keepalive_env ) |
| { |
| sv_keepalive = atoi( lv_keepalive_env ); |
| } |
| if ( sv_keepalive == 1 ) |
| { |
| char *lv_keepidle_env = getenv( "SQ_MON_KEEPIDLE" ); |
| if ( lv_keepidle_env ) |
| { |
| sv_keepidle = atoi( lv_keepidle_env ); |
| } |
| char *lv_keepintvl_env = getenv( "SQ_MON_KEEPINTVL" ); |
| if ( lv_keepintvl_env ) |
| { |
| sv_keepintvl = atoi( lv_keepintvl_env ); |
| } |
| char *lv_keepcnt_env = getenv( "SQ_MON_KEEPCNT" ); |
| if ( lv_keepcnt_env ) |
| { |
| sv_keepcnt = atoi( lv_keepcnt_env ); |
| } |
| } |
| } |
| |
| if ( sv_keepalive == 1 ) |
| { |
| if ( setsockopt( sock, SOL_SOCKET, SO_KEEPALIVE, &sv_keepalive, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt so_keepalive() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err ) ); |
| mon_log_write( MON_CLUSTER_SETKEEPALIVESOCKOPT_1, SQ_LOG_ERR, la_buf ); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| if ( setsockopt( sock, SOL_TCP, TCP_KEEPIDLE, &sv_keepidle, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt tcp_keepidle() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err ) ); |
| mon_log_write( MON_CLUSTER_SETKEEPALIVESOCKOPT_2, SQ_LOG_ERR, la_buf ); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| if ( setsockopt( sock, SOL_TCP, TCP_KEEPINTVL, &sv_keepintvl, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt tcp_keepintvl() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err ) ); |
| mon_log_write( MON_CLUSTER_SETKEEPALIVESOCKOPT_3, SQ_LOG_ERR, la_buf ); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| if ( setsockopt( sock, SOL_TCP, TCP_KEEPCNT, &sv_keepcnt, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt tcp_keepcnt() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err ) ); |
| mon_log_write( MON_CLUSTER_SETKEEPALIVESOCKOPT_4, SQ_LOG_ERR, la_buf ); |
| close( sock ); |
| return ( -2 ); |
| } |
| } |
| |
| TRACE_EXIT; |
| return ( 0 ); |
| } |
| |
| int CCluster::MkCltSock( unsigned char srcip[4], unsigned char dstip[4], int port ) |
| { |
| const char method_name[] = "CCluster::MkCltSock2"; |
| TRACE_ENTRY; |
| |
| int sock; // socket |
| int ret; // returned value |
| int reuse = 1; // sockopt reuse option |
| int nodelay = 1; // sockopt nodelay option |
| #if defined(_XOPEN_SOURCE_EXTENDED) |
| #ifdef __LP64__ |
| socklen_t size; // size of socket address |
| #else |
| size_t size; // size of socket address |
| #endif |
| #else |
| int size; // size of socket address |
| #endif |
| static int retries = 0; // # times to retry connect |
| int outer_failures = 0; // # failed connect loops |
| int connect_failures = 0; // # failed connects |
| char *p; // getenv results |
| struct sockaddr_in sockinfo; // socket address info |
| |
| size = sizeof(sockinfo); |
| |
| if ( !retries ) |
| { |
| p = getenv( "HPMP_CONNECT_RETRIES" ); |
| if ( p ) retries = atoi( p ); |
| else retries = 5; |
| } |
| |
| for ( ;; ) |
| { |
| sock = socket( AF_INET, SOCK_STREAM, 0 ); |
| if ( sock < 0 ) return ( -1 ); |
| |
| // Bind local address if specified. |
| if ( srcip ) |
| { |
| memset( (char *) &sockinfo, 0, size ); |
| memcpy( (char *) &sockinfo.sin_addr, |
| (char *) srcip, sizeof(srcip) ); |
| sockinfo.sin_family = AF_INET; |
| sockinfo.sin_port = 0; |
| if ( bind( sock, (struct sockaddr *) &sockinfo, size ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], bind() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_7, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| } |
| |
| // Connect socket. |
| memset( (char *) &sockinfo, 0, size ); |
| memcpy( (char *) &sockinfo.sin_addr, (char *) dstip, 4 ); |
| sockinfo.sin_family = AF_INET; |
| sockinfo.sin_port = htons( (unsigned short) port ); |
| |
| // Note the outer loop uses "retries" from HPMP_CONNECT_RETRIES, |
| // and has a yield between each retry, since it's more oriented |
| // toward failures from network overload and putting a pause |
| // between retries. This inner loop should only iterate when |
| // a signal interrupts the local process, so it doesn't pause |
| // or use the same HPMP_CONNECT_RETRIES count. |
| connect_failures = 0; |
| ret = 1; |
| while ( ret != 0 && connect_failures <= 10 ) |
| { |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - Connecting to addr=%d.%d.%d.%d, port=%d, connect_failures=%d\n" |
| , method_name, __LINE__ |
| , (int)dstip[0] |
| , (int)dstip[1] |
| , (int)dstip[2] |
| , (int)dstip[3] |
| , port |
| , connect_failures ); |
| } |
| ret = connect( sock, (struct sockaddr *) &sockinfo, |
| size ); |
| if ( ret == 0 ) break; |
| if ( errno == EINTR ) |
| { |
| ++connect_failures; |
| } |
| #ifdef NAMESERVER_PROCESS |
| else if ( errno == ECONNREFUSED ) |
| { |
| ++connect_failures; |
| sleep( 1 ); |
| } |
| #endif |
| else |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], connect(%d.%d.%d.%d:%d) failed! errno=%d (%s)\n" |
| , method_name |
| , (int)((unsigned char *)dstip)[0] |
| , (int)((unsigned char *)dstip)[1] |
| , (int)((unsigned char *)dstip)[2] |
| , (int)((unsigned char *)dstip)[3] |
| , port |
| , err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_8, SQ_LOG_ERR, la_buf); |
| close(sock); |
| return ( -1 ); |
| } |
| } |
| |
| if ( ret == 0 ) break; |
| |
| // For large clusters, the connect/accept calls seem to fail occasionally, |
| // no doubt do to the large number (1000's) of simultaneous connect packets |
| // flooding the network at once. So, we retry up to HPMP_CONNECT_RETRIES |
| // number of times. |
| if ( errno != EINTR ) |
| { |
| if ( ++outer_failures > retries ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| sprintf( la_buf, "[%s], connect() exceeded retries! count=%d\n" |
| , method_name, retries); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_9, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -1 ); |
| } |
| struct timespec req, rem; |
| req.tv_sec = 0; |
| req.tv_nsec = 500000; |
| nanosleep( &req, &rem ); |
| } |
| close( sock ); |
| } |
| |
| if ( setsockopt( sock, IPPROTO_TCP, TCP_NODELAY, (char *) &nodelay, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_10, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" |
| , method_name, err, strerror( err )); |
| mon_log_write(MON_CLUSTER_MKCLTSOCK_11, SQ_LOG_ERR, la_buf); |
| close( sock ); |
| return ( -2 ); |
| } |
| |
| int lv_retcode = SetKeepAliveSockOpt( sock ); |
| if ( lv_retcode != 0 ) |
| { |
| return lv_retcode; |
| } |
| |
| TRACE_EXIT; |
| return ( sock ); |
| } |
| |
| int CCluster::ReceiveMPI(char *buf, int size, int source, MonXChngTags tag, MPI_Comm comm) |
| { |
| const char method_name[] = "CCluster::ReceiveMPI"; |
| TRACE_ENTRY; |
| |
| MPI_Request request; |
| MPI_Status status; |
| int received = 0; |
| |
| int error = MPI_Irecv(buf, size, MPI_CHAR, source, tag, comm, &request); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - Msg Received. Error = %d\n", method_name, __LINE__, error); |
| |
| if (!error) |
| { |
| while (!received) |
| { |
| error = MPI_Test(&request, &received, &status); |
| |
| if (!error) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - Msg Received Test. Flag = %d\n", method_name, __LINE__, received); |
| } |
| else |
| { |
| usleep(10000); // sleep 10ms and try again |
| } |
| } |
| } |
| |
| TRACE_EXIT; |
| return error; |
| } |
| |
| int CCluster::SendMPI(char *buf, int size, int source, MonXChngTags tag, MPI_Comm comm) |
| { |
| const char method_name[] = "CCluster::SendMPI"; |
| TRACE_ENTRY; |
| |
| MPI_Request request; |
| MPI_Status status; |
| int sent = 0; |
| |
| int error = MPI_Isend(buf, size, MPI_CHAR, source, tag, comm, &request); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - Msg Sent. Error = %d\n", method_name, __LINE__, error); |
| |
| if (!error) |
| { |
| while (!sent) |
| { |
| error = MPI_Test(&request, &sent, &status); |
| |
| if (!error) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| trace_printf("%s@%d - Msg Sent Test. Flag = %d\n", method_name, __LINE__, sent); |
| } |
| else |
| { |
| usleep(10000); // sleep 10ms and try again |
| } |
| } |
| } |
| |
| TRACE_EXIT; |
| return error; |
| } |
| |
| int CCluster::ReceiveSock(char *buf, int size, int sockFd, const char *desc) |
| { |
| const char method_name[] = "CCluster::ReceiveSock"; |
| TRACE_ENTRY; |
| |
| bool readAgain = false; |
| int error = 0; |
| int readCount = 0; |
| int received = 0; |
| int sizeCount = size; |
| |
| do |
| { |
| readCount = (int) recv( sockFd |
| , buf |
| , sizeCount |
| , 0 ); |
| if ( readCount > 0 ) Meas.addSockRcvdBytes( readCount ); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - recv(%d), sock=%d, readCount=%d, desc=%s\n" |
| , method_name, __LINE__ |
| , sizeCount |
| , sockFd |
| , readCount |
| , desc ); |
| } |
| |
| if ( readCount > 0 ) |
| { // Got data |
| received += readCount; |
| buf += readCount; |
| if ( received == size ) |
| { |
| readAgain = false; |
| } |
| else |
| { |
| sizeCount -= readCount; |
| readAgain = true; |
| } |
| } |
| else if ( readCount == 0 ) |
| { // EOF |
| error = ENODATA; |
| readAgain = false; |
| } |
| else |
| { // Got an error |
| if ( errno != EINTR) |
| { |
| error = errno; |
| readAgain = false; |
| } |
| else |
| { |
| readAgain = true; |
| } |
| } |
| } |
| while( readAgain ); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - recv(), received=%d, sock=%d, error=%d(%s), desc=%s\n" |
| , method_name, __LINE__ |
| , received |
| , sockFd |
| , error, strerror(error) |
| , desc ); |
| } |
| |
| TRACE_EXIT; |
| return error; |
| } |
| |
| int CCluster::SendSock(char *buf, int size, int sockFd, const char *desc) |
| { |
| const char method_name[] = "CCluster::SendSock"; |
| TRACE_ENTRY; |
| |
| bool sendAgain = false; |
| int error = 0; |
| int sendCount = 0; |
| int sent = 0; |
| |
| do |
| { |
| sendCount = (int) send( sockFd |
| , buf |
| , size |
| , 0 ); |
| if ( sendCount > 0 ) Meas.addSockSentBytes( sendCount ); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - send(), sock=%d, sendCount=%d, desc=%s\n" |
| , method_name, __LINE__ |
| , sockFd |
| , sendCount |
| , desc ); |
| } |
| |
| if ( sendCount > 0 ) |
| { // Sent data |
| sent += sendCount; |
| if ( sendCount == size ) |
| { |
| sendAgain = false; |
| } |
| else |
| { |
| sendAgain = true; |
| } |
| } |
| else |
| { // Got an error |
| if ( errno != EINTR) |
| { |
| error = errno; |
| sendAgain = false; |
| } |
| else |
| { |
| sendAgain = true; |
| } |
| } |
| } |
| while( sendAgain ); |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) |
| { |
| trace_printf( "%s@%d - send(), sent=%d, sock=%d, error=%d(%s), desc=%s\n" |
| , method_name, __LINE__ |
| , sent |
| , sockFd |
| , error, strerror(error) |
| , desc ); |
| } |
| |
| TRACE_EXIT; |
| return error; |
| } |