| /////////////////////////////////////////////////////////////////////////////// |
| // |
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| // |
| /////////////////////////////////////////////////////////////////////////////// |
| |
| #include <iostream> |
| |
| using namespace std; |
| |
| #include <fcntl.h> |
| #include <semaphore.h> |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <unistd.h> |
| #include <dirent.h> |
| #include <sys/file.h> |
| #include <sys/ipc.h> |
| #include <sys/ptrace.h> |
| #include <sys/resource.h> |
| #include <sys/stat.h> |
| #include <sys/time.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <errno.h> |
| #include <sys/epoll.h> |
| |
| #include "props.h" |
| #include "localio.h" |
| #include "mlio.h" |
| #include "monlogging.h" |
| #ifdef USE_FORK_SUSPEND_RESUME |
| #include "monrs.h" |
| #endif // USE_FORK_SUSPEND_RESUME |
| #include "monsonar.h" |
| #include "montrace.h" |
| #include "redirector.h" |
| #include "healthcheck.h" |
| #include "lock.h" |
| #include "config.h" |
| #include "device.h" |
| #include "monitor.h" |
| #include "msgdef.h" |
| #include "clusterconf.h" |
| #include "lnode.h" |
| #include "pnode.h" |
| #include "process.h" |
| #include "intprocess.h" |
| #include "gentrap.h" |
| #include "nameserver.h" |
| |
| #include "reqqueue.h" |
| extern CReqQueue ReqQueue; |
| |
| #include "replicate.h" |
| |
| #ifndef NAMESERVER_PROCESS |
| #include "ptpclient.h" |
| #endif |
| |
| extern bool IsAgentMode; |
| extern bool IsMaster; |
| |
| extern bool PidMap; |
| extern int Measure; |
| extern int trace_level; |
| extern int MyPNID; |
| extern char MyCommPort[MPI_MAX_PORT_NAME]; |
| extern char Node_name[MPI_MAX_PROCESSOR_NAME]; |
| extern sigset_t SigSet; |
| extern CLock MemModLock; |
| extern CMonitor *Monitor; |
| #ifndef NAMESERVER_PROCESS |
| extern bool NameServerEnabled; |
| extern CNameServer *NameServer; |
| extern CPtpClient *PtpClient; |
| #endif |
| extern CNodeContainer *Nodes; |
| extern CConfigContainer *Config; |
| #ifndef NAMESERVER_PROCESS |
| extern CDeviceContainer *Devices; |
| #endif |
| extern CNode *MyNode; |
| extern CMonStats *MonStats; |
| #ifndef NAMESERVER_PROCESS |
| extern CRedirector Redirector; |
| #endif |
| extern CHealthCheck HealthCheck; |
| extern CReplicate Replicator; |
| extern CIntProcess IntProcess; |
| |
| extern char *ErrorMsg (int error_code); |
| extern _TM_Txid_External invalid_trans( void ); |
| extern _TM_Txid_External null_trans( void ); |
| extern bool isEqual( _TM_Txid_External trans1, _TM_Txid_External trans2 ); |
| extern bool isNull( _TM_Txid_External transid ); |
| extern bool isInvalid( _TM_Txid_External transid ); |
| |
| extern bool IAmIntegrated; |
| extern bool SMSIntegrating; |
| |
| extern const char *NodePhaseString( NodePhase phase ); |
| extern const char *ProcessTypeString( PROCESSTYPE type ); |
| |
| extern int monitorArgc; |
| extern char monitorArgv[MAX_ARGS][MAX_ARG_SIZE]; |
| |
| CProcess::CProcess (CProcess * parent, int nid, int pid, |
| #ifdef NAMESERVER_PROCESS |
| Verifier_t verifier, |
| #endif |
| PROCESSTYPE type, |
| int priority, int backup, bool debug, bool unhooked, |
| char *name, |
| #ifdef NAMESERVER_PROCESS |
| char *path, |
| char *ldpath, |
| char *program, |
| #else |
| strId_t pathStrId, strId_t ldpathStrId, strId_t programStrId, |
| #endif |
| char *infile, char *outfile) |
| : |
| Nid (nid), |
| Pid (pid), |
| #ifdef NAMESERVER_PROCESS |
| Verifier ( verifier ), |
| #else |
| Verifier ( -1 ), |
| #endif |
| PidAtFork_ (pid), |
| Type (type), |
| Event_messages (false), |
| System_messages (false), |
| Paired (false), |
| Clone (false), |
| Debug(debug), |
| DeletePending (false), |
| StartupCompleted (false), |
| Backup (backup), |
| Abended (false), |
| Attached (false), |
| abort_(false), |
| Persistent (false), |
| UnHooked (unhooked), |
| Nowait (false), |
| PersistentCreateTime (0), |
| PersistentRetries (0), |
| Tag ( 0 ), |
| Parent (parent), |
| PairParentNid (-1), |
| PairParentPid (-1), |
| PairParentVerifier (-1), |
| ReplyTag (REPLY_TAG), // will be set again when we have a pending reply |
| OpenedCount (0), |
| LastNid (nid), |
| DumpState (Dump_Ready), |
| DumpStatus (Dump_Success), |
| DumperNid (-1), |
| DumperPid (-1), |
| DumperVerifier (-1), |
| priorPid_ (0), |
| State_ (State_Unknown), |
| next_(NULL), |
| prev_(NULL), |
| nextL_(NULL), |
| prevL_(NULL), |
| unsolTmSyncCount_(0), |
| Last_error (MPI_SUCCESS) |
| , argc_(0) |
| , userArgvLen_ (0) |
| , userArgv_ (NULL) |
| #ifdef NAMESERVER_PROCESS |
| , path_(path) |
| , ldpath_(ldpath) |
| , program_(program) |
| #else |
| , path_() |
| , ldpath_() |
| , program_() |
| , programStrId_(programStrId) |
| , pathStrId_(pathStrId) |
| , ldpathStrId_(ldpathStrId) |
| #endif |
| , firstInstance_(true) |
| , cmpOrEsp_(false) |
| , trafConf_() |
| , trafHome_() |
| , trafVar_() |
| , fd_stdin_(-1) |
| , fd_stdout_(-1) |
| , fd_stderr_(-1) |
| , owned_(false) |
| , ownerId_(0) |
| , replRefCount_(0) |
| , requestBuf_ (NULL) |
| #ifndef NAMESERVER_PROCESS |
| , NoticeHead(NULL) |
| , NoticeTail(NULL) |
| #endif |
| #ifdef NAMESERVER_PROCESS |
| , monSockFd_(-1) |
| , origPNidNs_(-1) |
| #endif |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CProcess::CProcess"; |
| TRACE_ENTRY; |
| |
| // Add eyecatcher sequence as a debugging aid |
| memcpy(&eyecatcher_, "PROC", 4); |
| |
| hangupTime_.tv_sec = 0; |
| hangupTime_.tv_nsec = 0; |
| |
| Port[0] = '\0'; |
| STRCPY (Name, name); |
| CreationTime.tv_sec = 0; |
| CreationTime.tv_nsec = 0; |
| if ( infile && strcmp(infile,"#default") != 0) |
| infile_ = infile; |
| if ( outfile && strcmp(outfile,"#default") != 0) |
| outfile_ = outfile; |
| |
| #ifndef NAMESERVER_PROCESS |
| Config->strIdToString(programStrId_, program_ ); |
| #endif |
| |
| switch (Type) |
| { |
| case ProcessType_ASE: |
| case ProcessType_TSE: |
| Priority = (priority<TSE_BASE_NICE?TSE_BASE_NICE:priority); |
| break; |
| case ProcessType_DTM: |
| Priority = (priority<DTM_BASE_NICE?DTM_BASE_NICE:priority); |
| break; |
| case ProcessType_NameServer: |
| case ProcessType_Watchdog: |
| case ProcessType_PSD: |
| Priority = priority; |
| break; |
| case ProcessType_AMP: |
| case ProcessType_Backout: |
| case ProcessType_VolumeRecovery: |
| case ProcessType_MXOSRVR: |
| case ProcessType_PERSIST: |
| case ProcessType_SMS: |
| case ProcessType_SPX: |
| case ProcessType_SSMP: |
| case ProcessType_TMID: |
| case ProcessType_Generic: |
| Priority = (priority<APP_BASE_NICE?APP_BASE_NICE:priority); |
| break; |
| default: |
| Priority = priority; |
| snprintf(la_buf, sizeof(la_buf), |
| "[CProcess::CProcess], Invalid process type!\n"); |
| mon_log_write(MON_PROCESS_PROCESS_1, SQ_LOG_ERR, la_buf); |
| } |
| |
| switch (Type) |
| { |
| case ProcessType_DTM: |
| case ProcessType_PSD: |
| case ProcessType_PERSIST: |
| case ProcessType_SMS: |
| case ProcessType_SPX: |
| case ProcessType_SSMP: |
| case ProcessType_TMID: |
| case ProcessType_Watchdog: |
| case ProcessType_NameServer: |
| Persistent = true; |
| break; |
| default: |
| break; |
| } |
| |
| if (parent) |
| { |
| // the process is being started at the request of a parent process |
| Parent_Nid = parent->Nid; |
| Parent_Pid = parent->Pid; |
| Parent_Verifier = parent->Verifier; |
| if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - Process (%d, %d) has parent (%d, %d)\n", method_name, __LINE__, Nid, Pid, Parent_Nid, Parent_Pid); |
| if (Backup) |
| { |
| PairParentNid = parent->PairParentNid; |
| PairParentPid = parent->PairParentPid; |
| parent->Parent_Nid = Nid; |
| parent->Parent_Pid = Pid; |
| if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - Backup process %s (%d, %d) has process " |
| "pair parent (%d, %d) and primary process " |
| "(%d, %d)\n", |
| method_name, __LINE__, Name, Nid, Pid, |
| PairParentNid, PairParentPid, |
| parent->Nid, parent->Pid); |
| } |
| } |
| else |
| { |
| // the process is being started by the monitor at initiation time |
| Parent_Nid = -1; |
| Parent_Pid = -1; |
| Parent_Verifier = -1; |
| if (backup) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[CProcess::CProcess], No Primary for Backup process!\n"); |
| mon_log_write(MON_PROCESS_PROCESS_2, SQ_LOG_ERR, la_buf); |
| } |
| } |
| if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d" " - Process " "%s (nid=%d, priority=%d)" " created @ " "%p""\n", method_name, __LINE__, Name, Nid, Priority, this); |
| |
| Monitor->IncProcessCount(); |
| |
| // Record statistics (sonar counters) |
| if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED)) |
| MonStats->NumProcsIncr(); |
| |
| TRACE_EXIT; |
| } |
| |
| CProcess::~CProcess (void) |
| { |
| const char method_name[] = "CProcess::~CProcess"; |
| TRACE_ENTRY; |
| Monitor->DecrProcessCount(); |
| |
| if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d" " - Process " "%s(%d,%d:%d)" " destroyed @ " "%p""\n", method_name, __LINE__, Name, Nid, Pid, Verifier, this); |
| |
| // Record statistics (sonar counters) |
| if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED)) |
| MonStats->NumProcsDecr(); |
| |
| |
| #ifndef NAMESERVER_PROCESS |
| deathInterestLock_.lock(); |
| CNotice *notice = NoticeHead; |
| while (notice) |
| { |
| // Send death notice messages to all opened processes |
| notice->DeLink (&NoticeHead, &NoticeTail); |
| delete notice; |
| |
| notice = NoticeHead; |
| } |
| deathInterestLock_.unlock(); |
| #endif |
| |
| // For SSM process, release any undelivered pending notices. |
| struct message_def * deathNotice; |
| while ((deathNotice = GetDeathNotice()) != NULL) |
| { |
| delete deathNotice; |
| } |
| |
| delete [] userArgv_; |
| |
| #ifndef NAMESERVER_PROCESS |
| if (fd_stdin_ != -1 && !Clone) |
| { |
| Redirector.tryShutdownPipeFd(Pid, fd_stdin_, false); |
| } |
| |
| if (fd_stdout_ != -1) |
| { |
| Redirector.tryShutdownPipeFd(Pid, fd_stdout_, true); |
| } |
| |
| if (fd_stderr_ != -1) |
| { |
| Redirector.tryShutdownPipeFd(Pid, fd_stderr_, false); |
| } |
| #endif |
| |
| // Remove the fifos associated with this process (if any) |
| if (fifo_stdin_.size() != 0) |
| { |
| unlink(fifo_stdin_.c_str()); |
| } |
| |
| if (fifo_stdout_.size() != 0) |
| { |
| unlink(fifo_stdout_.c_str()); |
| } |
| |
| if (fifo_stderr_.size() != 0) |
| { |
| unlink(fifo_stderr_.c_str()); |
| } |
| |
| // Alter eyecatcher sequence as a debugging aid to identify deleted object |
| memcpy(&eyecatcher_, "proc", 4); |
| |
| TRACE_EXIT; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| const char* CProcess::path() |
| { |
| Config->strIdToString(pathStrId_, path_ ); |
| return( path_.c_str() ); |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| const char* CProcess::ldpath() |
| { |
| Config->strIdToString(ldpathStrId_, ldpath_ ); |
| return( ldpath_.c_str() ); |
| } |
| #endif |
| |
| int CProcess::getUserArgs( char user_argv[MAX_ARGS][MAX_ARG_SIZE] ) |
| { |
| const char *pUserArgv = userArgv_; |
| int i, arglen; |
| for (i = 0; i < argc_; i++) |
| { |
| arglen = strlen (pUserArgv) + 1; |
| strcpy( user_argv[i], pUserArgv ); |
| pUserArgv += arglen; |
| } |
| strcpy( user_argv[i], "" ); |
| return(argc_); |
| } |
| |
| void CProcess::userArgs ( int argc, int argvLen, const char * argvList ) |
| { |
| const char method_name[] = "CProcess::userArgs"; |
| TRACE_ENTRY; |
| |
| argc_ = argc; |
| userArgvLen_ = argvLen; |
| if ( userArgv_ != NULL ) |
| { |
| delete[] userArgv_; |
| } |
| userArgv_ = new char[ argvLen ]; |
| memcpy(userArgv_, argvList, argvLen); |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcess::userArgs ( int argc, char user_argv[MAX_ARGS][MAX_ARG_SIZE] ) |
| { |
| const char method_name[] = "CProcess::userArgs"; |
| TRACE_ENTRY; |
| |
| argc_ = argc; |
| |
| // Compute amount of space need to store argument strings |
| userArgvLen_ = 0; |
| for (int i = 0; i < argc; i++) |
| { |
| userArgvLen_ += strlen(user_argv[i]) + 1; |
| } |
| if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - Copying arguments argc=%d, argvSize=%d\n", |
| method_name, __LINE__, argc, userArgvLen_); |
| if (userArgvLen_ != 0) |
| { |
| userArgv_ = new char[userArgvLen_]; |
| } |
| |
| char *pUserArgv = userArgv_; |
| for (int i = 0; i < argc; i++) |
| { |
| if (trace_settings & (TRACE_PROCESS_DETAIL | TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - name=%s, Copying user_argv[%d]='%s'\n", method_name, __LINE__, Name, i, user_argv[i]); |
| strcpy (pUserArgv, user_argv[i]); |
| pUserArgv += strlen(user_argv[i]) + 1; |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcess::validateObj( void ) |
| { |
| if (strncmp((const char *)&eyecatcher_, "PROC", 4) !=0 ) |
| { // Not a valid object |
| abort(); |
| } |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcess::CancelDeathNotification( int nid |
| , int pid |
| , Verifier_t verifier |
| , _TM_Txid_External trans_id ) |
| { |
| bool status = FAILURE; |
| CNotice *next; |
| |
| const char method_name[] = "CProcess::CancelDeathNotification"; |
| TRACE_ENTRY; |
| |
| deathInterestLock_.lock(); |
| CNotice *notice = NoticeHead; |
| |
| while( notice ) |
| { |
| if ((( notice->Nid == nid ) && |
| ( notice->Pid == pid ) && |
| ( notice->verifier_ == verifier ) && |
| ( isInvalid( trans_id ) || isEqual( notice->TransID, trans_id ))) |
| || (( nid == -1 || pid == -1 ) && |
| ( isEqual(notice->TransID, trans_id) ) ) ) |
| { |
| next = notice->GetNext(); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL |
| | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d - Process %s (%d, %d:%d) deleting death " |
| "notice interest for %s (%d, %d:%d), " |
| "trans_id=%lld.%lld.%lld.%lld\n" |
| , method_name, __LINE__ |
| , Name |
| , Nid |
| , Pid |
| , Verifier |
| , notice->name_.c_str() |
| , notice->Nid |
| , notice->Pid |
| , notice->verifier_ |
| , notice->TransID.txid[0] |
| , notice->TransID.txid[1] |
| , notice->TransID.txid[2] |
| , notice->TransID.txid[3] ); |
| } |
| |
| notice->DeLink(&NoticeHead, &NoticeTail); |
| delete notice; |
| |
| notice = next; |
| |
| status = SUCCESS; |
| } |
| else |
| { |
| notice = notice->GetNext(); |
| } |
| } |
| |
| deathInterestLock_.unlock(); |
| |
| TRACE_EXIT; |
| return status; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| // Death notice registration for a process |
| bool CProcess::procExitReg(CProcess *targetProcess, |
| _TM_Txid_External transId) |
| { |
| const char method_name[] = "CProcess::ProcExitReg"; |
| TRACE_ENTRY; |
| |
| bool status = FAILURE; |
| |
| if ( Nid != targetProcess->GetParentNid() || |
| Pid != targetProcess->GetParentPid()) |
| { // This process is not the parent of the target process (parent |
| // processes automatically get process death notifications.) |
| |
| nidPid_t target = { targetProcess->Nid, targetProcess->Pid }; |
| deathInterestLock_.lock(); |
| // Add entry to list of processes that are being monitored |
| // by this process. |
| deathInterest_.push_back( target ); |
| // Add entry to set of nids of processes that are being monitored |
| // by this process. |
| deathInterestNid_.insert( targetProcess->Nid ); |
| deathInterestLock_.unlock(); |
| |
| // Register interest with the target process |
| targetProcess->RegisterDeathNotification( Nid |
| , Pid |
| , Verifier |
| , Name |
| , transId ); |
| status = SUCCESS; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| { |
| trace_printf("%s@%d - Process %s (%d, %d) registered interest " |
| "in death of process %s (%d, %d), " |
| "trans_id=%lld.%lld.%lld.%lld\n", |
| method_name, __LINE__, Name, Nid, Pid, |
| targetProcess->Name, targetProcess->Nid, |
| targetProcess->Pid, |
| transId.txid[0], transId.txid[1], transId.txid[2], |
| transId.txid[3] ); |
| } |
| } |
| |
| |
| TRACE_EXIT; |
| return status; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::procExitNotifierNodes( void ) |
| { |
| const char method_name[] = "CProcess::procExitNotifierNodes"; |
| TRACE_ENTRY; |
| |
| CLNode *targetLNode = NULL; |
| CNode *targetNode = NULL; |
| nidSet_t::iterator it; |
| |
| // Remove death notice registration for all entries on list |
| deathInterestLock_.lock(); |
| for ( it = deathInterestNid_.begin(); it != deathInterestNid_.end(); ++it) |
| { |
| targetLNode = Nodes->GetLNode ( *it ); |
| if (targetLNode) |
| { |
| targetNode = targetLNode->GetNode(); |
| } |
| |
| if ( targetNode ) |
| { |
| if (NameServerEnabled && targetNode->GetPNid() != MyPNID) |
| { |
| // Forward the process exit to the target node |
| int rc = PtpClient->ProcessExit( this |
| , targetLNode->GetNid() |
| , targetNode->GetName() ); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process exit " |
| "for process %s (%d, %d) " |
| "to target node %s, nid=%d\n" |
| , method_name |
| , GetName() |
| , GetNid() |
| , GetPid() |
| , targetLNode->GetNode()->GetName() |
| , targetLNode->GetNid() ); |
| mon_log_write(MON_PROCESS_PROCEXITNOTIFIERNODES_1, SQ_LOG_ERR, la_buf); |
| } |
| } |
| } |
| } |
| deathInterestNid_.clear(); |
| deathInterestLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::procExitUnregAll ( _TM_Txid_External transId ) |
| { |
| const char method_name[] = "CProcess::procExitUnregAll"; |
| TRACE_ENTRY; |
| |
| CLNode *node; |
| CProcess *targetProcess = NULL; |
| nidPidList_t::iterator it; |
| |
| // Remove death notice registration for all entries on list |
| deathInterestLock_.lock(); |
| for ( it = deathInterest_.begin(); it != deathInterest_.end(); ++it) |
| { |
| node = Nodes->GetLNode ( it->nid ); |
| targetProcess = NULL; |
| if (node) |
| { |
| targetProcess = node->GetProcessL( it->pid ); |
| } |
| |
| if ( targetProcess ) |
| { |
| if (NameServerEnabled && targetProcess->IsClone()) |
| { |
| CLNode *targetLNode = Nodes->GetLNode( targetProcess->GetNid() ); |
| |
| int rc = -1; |
| // Forward the process cancel death notification to the target node |
| rc = PtpClient->ProcessNotify( targetProcess->GetNid() |
| , targetProcess->GetPid() |
| , targetProcess->GetVerifier() |
| , transId |
| , true // cancel target's death notification |
| , this // of this process |
| , targetLNode->GetNid() |
| , targetLNode->GetNode()->GetName() ); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process notify request " |
| "for process %s (%d, %d) " |
| "to target node %s, nid=%d\n" |
| , method_name |
| , targetProcess->GetName() |
| , targetProcess->GetNid() |
| , targetProcess->GetPid() |
| , targetLNode->GetNode()->GetName() |
| , targetLNode->GetNid() ); |
| mon_log_write(MON_PROCESS_PROCEXITUNREGALL_1, SQ_LOG_ERR, la_buf); |
| } |
| } |
| |
| targetProcess->CancelDeathNotification( Nid |
| , Pid |
| , Verifier |
| , transId ); |
| } |
| } |
| deathInterest_.clear(); |
| deathInterestLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::childAdd ( int nid, int pid ) |
| { |
| const char method_name[] = "CProcess::childAdd"; |
| TRACE_ENTRY; |
| |
| nidPid_t child = { nid, pid }; |
| childrenListLock_.lock(); |
| children_.push_back ( child ); |
| childrenListLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| |
| int CProcess::childCount ( void ) |
| { |
| const char method_name[] = "CProcess::childCount"; |
| TRACE_ENTRY; |
| |
| childrenListLock_.lock(); |
| int count = children_.size(); |
| childrenListLock_.unlock(); |
| |
| TRACE_EXIT; |
| return(count); |
| } |
| |
| void CProcess::childRemove ( int nid, int pid ) |
| { |
| const char method_name[] = "CProcess::childRemove"; |
| TRACE_ENTRY; |
| |
| nidPidList_t::iterator it; |
| |
| childrenListLock_.lock(); |
| for ( it = children_.begin(); it != children_.end(); ++it) |
| { |
| if (it->nid == nid && it->pid == pid ) |
| { |
| children_.erase ( it ); |
| break; |
| } |
| } |
| childrenListLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| |
| bool CProcess::childRemoveFirst ( nidPid_t & child) |
| { |
| const char method_name[] = "CProcess::childRemoveFirst"; |
| TRACE_ENTRY; |
| |
| bool result = false; |
| |
| childrenListLock_.lock(); |
| if ( !children_.empty() ) |
| { |
| child = children_.front (); |
| children_.pop_front (); |
| result = true; |
| |
| } |
| childrenListLock_.unlock(); |
| |
| TRACE_EXIT; |
| |
| return result; |
| } |
| |
| void CProcess::childUnHookedAdd( int nid, int pid ) |
| { |
| const char method_name[] = "CProcess::childUnHookedAdd"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & (TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d adding unhooked child (%d:%d)\n" |
| , method_name, __LINE__ |
| , nid, pid ); |
| |
| nidPid_t child = { nid, pid }; |
| childrenListLock_.lock(); |
| childrenUnHooked_.push_back ( child ); |
| childrenListLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| |
| int CProcess::childUnHookedCount( void ) |
| { |
| const char method_name[] = "CProcess::childUnHookedCount"; |
| TRACE_ENTRY; |
| |
| childrenListLock_.lock(); |
| int count = childrenUnHooked_.size(); |
| childrenListLock_.unlock(); |
| |
| TRACE_EXIT; |
| return(count); |
| } |
| |
| void CProcess::childUnHookedRemove( int nid, int pid ) |
| { |
| const char method_name[] = "CProcess::childUnHookedRemove"; |
| TRACE_ENTRY; |
| |
| nidPidList_t::iterator it; |
| |
| childrenListLock_.lock(); |
| for ( it = childrenUnHooked_.begin(); it != childrenUnHooked_.end(); ++it) |
| { |
| if (it->nid == nid && it->pid == pid ) |
| { |
| childrenUnHooked_.erase ( it ); |
| break; |
| } |
| } |
| childrenListLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| |
| bool CProcess::childUnHookedRemoveFirst( nidPid_t & child) |
| { |
| const char method_name[] = "CProcess::childUnHookedRemoveFirst"; |
| TRACE_ENTRY; |
| |
| bool result = false; |
| |
| childrenListLock_.lock(); |
| if ( !childrenUnHooked_.empty() ) |
| { |
| child = childrenUnHooked_.front (); |
| childrenUnHooked_.pop_front (); |
| result = true; |
| |
| } |
| childrenListLock_.unlock(); |
| |
| TRACE_EXIT; |
| |
| return result; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::CompleteDump(DUMPSTATUS status, char *core_file) |
| { |
| CProcess *dumper; |
| struct message_def *msg; |
| |
| const char method_name[] = "CProcess::CompleteDump"; |
| TRACE_ENTRY; |
| |
| DumpStatus = status; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Dumper Process nid=%d, pid=%d:%d\n", |
| method_name, __LINE__, DumperNid, DumperPid, DumperVerifier); |
| dumper = Nodes->GetLNode (DumperNid)->GetProcessL(DumperPid); |
| if (dumper && MyNode->IsMyNode(DumperNid)) |
| { |
| if ( (DumperVerifier == -1) || (DumperVerifier == dumper->GetVerifier()) ) |
| { |
| msg = parentContext(); |
| if ( msg ) |
| { // reply to parent pending, so send reply |
| msg->noreply = false; |
| msg->type = MsgType_Service; |
| msg->u.reply.type = ReplyType_Dump; |
| msg->u.reply.u.dump.nid = Nid; |
| msg->u.reply.u.dump.pid = Pid; |
| msg->u.reply.u.dump.verifier = Verifier; |
| if (status == Dump_Success) |
| { |
| STRCPY(msg->u.reply.u.dump.core_file, core_file); |
| msg->u.reply.u.dump.return_code = MPI_SUCCESS; |
| } |
| else |
| { |
| msg->u.reply.u.dump.core_file[0] = 0; |
| msg->u.reply.u.dump.return_code = MPI_ERR_EXITED; |
| } |
| CRequest::lioreply (msg, dumper->GetPid()); |
| parentContext( NULL ); |
| } |
| } |
| } |
| |
| DumpState = Dump_Ready; |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::CompleteProcessStartup (char *port, int os_pid, bool event_messages, |
| bool system_messages, bool preclone, |
| struct timespec *creation_time, int /*origPNidNs*/) |
| { |
| const char method_name[] = "CProcess::CompleteProcessStartup"; |
| TRACE_ENTRY; |
| |
| STRCPY (Port, port); |
| Pid = os_pid; |
| Event_messages = event_messages; |
| System_messages = system_messages; |
| |
| if (preclone) |
| { |
| Clone = true; |
| } |
| |
| if (!Clone) |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d: process %s (%d, %d), preclone=%d" |
| ", clone=%d\n", |
| method_name, __LINE__, Name, |
| Nid, os_pid, preclone, Clone); |
| StartupCompleted = true; |
| if (os_pid != -1) |
| { |
| if ( MyNode->IsMyNode(Nid) ) |
| { |
| if ( NameServerEnabled ) |
| { |
| int rc = -1; |
| // Register process in Name Server |
| rc = NameServer->ProcessNew(this); // in reqQueue thread (CExtStartupReq) |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't register new process " |
| "%s (%d, %d) " |
| "to Name Server process\n" |
| , method_name |
| , GetName() |
| , GetNid() |
| , GetPid() ); |
| mon_log_write(MON_PROCESS_COMPLETESTARTUP_1, SQ_LOG_ERR, la_buf); |
| } |
| |
| if (Parent_Nid != -1) |
| { |
| if (Parent_Nid != Nid) |
| { |
| // Tell the parent node the current state of the process |
| rc = PtpClient->ProcessClone(this); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| CLNode *parentLNode = NULL; |
| parentLNode = Nodes->GetLNode( GetParentNid() ); |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process clone request" |
| "for process %s (%d, %d) " |
| "to parent node %s, nid=%d\n" |
| , method_name |
| , GetName() |
| , GetNid() |
| , GetPid() |
| , parentLNode->GetNode()->GetName() |
| , parentLNode->GetNid() ); |
| mon_log_write(MON_PROCESS_COMPLETESTARTUP_2, SQ_LOG_ERR, la_buf); |
| } |
| } |
| } |
| } |
| else |
| { |
| // Replicate the clone to other nodes |
| CReplClone *repl = new CReplClone(this); |
| Replicator.addItem(repl); |
| } |
| } |
| else |
| { |
| Clone = true; |
| } |
| } |
| else |
| { |
| // TODO: What does an os_pid == -1 mean? |
| if ( NameServerEnabled ) |
| { |
| if (Parent_Nid != -1) |
| { |
| if (Parent_Nid != Nid) |
| { |
| int rc = -1; |
| // Tell the parent node the current state of the process |
| rc = PtpClient->ProcessClone(this); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| CLNode *parentLNode = NULL; |
| parentLNode = Nodes->GetLNode( GetParentNid() ); |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process clone request" |
| "for process %s (%d, %d) " |
| "to parent node %s, nid=%d\n" |
| , method_name |
| , GetName() |
| , GetNid() |
| , GetPid() |
| , parentLNode->GetNode()->GetName() |
| , parentLNode->GetNid() ); |
| mon_log_write(MON_PROCESS_COMPLETESTARTUP_3, SQ_LOG_ERR, la_buf); |
| } |
| } |
| } |
| } |
| else |
| { |
| // Replicate the clone to other nodes |
| CReplClone *repl = new CReplClone(this); |
| Replicator.addItem(repl); |
| } |
| } |
| } |
| |
| if (!Clone) |
| { |
| // check if we need to setup any associated devices. |
| if ((Type == ProcessType_TSE) || |
| (Type == ProcessType_ASE) ) |
| { |
| Devices->CreateDevice( this ); |
| } |
| |
| if ((Type == ProcessType_TSE) || |
| (Type == ProcessType_DTM) || |
| (Type == ProcessType_ASE) ) |
| { |
| MyNode->addToQuiesceSendPids( GetPid(), GetVerifier() ); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s%d: pid %d added to quiesce send list\n", method_name, __LINE__, GetPid()); |
| } |
| |
| if ((Type == ProcessType_TSE) || |
| (Type == ProcessType_ASE) ) |
| { |
| MyNode->addToQuiesceExitPids( GetPid(), GetVerifier() ); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s%d: pid %d added to quiesce exit list\n", method_name, __LINE__, GetPid()); |
| } |
| } |
| |
| if ( Clone && !preclone ) |
| { |
| StartupCompleted = true; |
| if (creation_time != NULL) |
| CreationTime = *creation_time; |
| } |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d: process %s (%d, %d:%d), preclone=%d" |
| ", clone=%d, StartupCompleted=%d\n", |
| method_name, __LINE__, Name, Nid, os_pid, Verifier, preclone, |
| Clone, StartupCompleted); |
| State_ = State_Up; |
| |
| // Check if node is shutting down |
| if ( !Clone && MyNode->GetState() == State_Shutdown ) |
| { |
| if ( MyNode->GetShutdownLevel() == ShutdownLevel_Abrupt ) |
| { |
| // killing the process will not remove the process object because |
| // exit processing will get queued until this completes. |
| kill( Pid, SIGKILL ); |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Shutdown abrupt in process, completed kill for %s (%d, %d)\n" |
| , method_name, __LINE__, Name, Nid, os_pid); |
| } |
| else |
| { |
| struct message_def *msg; |
| |
| msg = new struct message_def; |
| msg->type = MsgType_Shutdown; |
| msg->noreply = true; |
| msg->u.request.type = ReqType_Notice; |
| msg->u.request.u.shutdown.nid = Nid; |
| msg->u.request.u.shutdown.pid = -1; |
| msg->u.request.u.shutdown.level = MyNode->GetShutdownLevel(); |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d" " - Sending shutdown notice, level=%d\n" |
| , method_name, __LINE__, MyNode->GetShutdownLevel() ); |
| // Send shutdown notice |
| SQ_theLocalIOToClient->putOnNoticeQueue( Pid, Verifier, msg, NULL ); |
| } |
| } |
| |
| // some special handling for native processes |
| if ( !Clone ) |
| { |
| ssmpNoticesLock_.lock(); |
| if ( Type == ProcessType_SSMP && !ssmpNotices_.empty()) |
| { // Some death notices are queued for this SSMP process. Signal |
| // the notifier to get to work on delivering them. |
| SQ_theLocalIOToClient->nudgeNotifier (); |
| } |
| ssmpNoticesLock_.unlock(); |
| |
| if ( Type == ProcessType_SMS ) |
| { |
| // let healthcheck thread know that the SMService process is up and running. |
| HealthCheck.setState(HC_UPDATE_SMSERVICE, (long long)this); |
| } |
| if ( Type == ProcessType_Watchdog ) |
| { |
| // let healthcheck thread know that the watchdog process is up and running. |
| HealthCheck.setState(HC_UPDATE_WATCHDOG, (long long)this); |
| // start the watchdog timer |
| HealthCheck.setState(MON_START_WATCHDOG); |
| } |
| if ( Type == ProcessType_PSD && |
| (IAmIntegrated || MyNode->IsActivatingSpare() || MyNode->IsSoftNodeDown()) ) |
| { |
| MyNode->StartPStartDPersistent(); |
| |
| if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_INIT)) |
| trace_printf("%s%d: Sent start persistent processes event to PSD process %s (pid=%d)\n", method_name, __LINE__, GetName(), GetPid()); |
| } |
| if ( Type == ProcessType_DTM && |
| MyNode->IsSoftNodeDown() ) |
| { |
| // Tell remote DTMs that this DTM was restarted |
| Monitor->SoftNodeUpPrepare( MyPNID ); |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| void CProcess::CompleteRequest( int status ) |
| { |
| #ifndef NAMESERVER_PROCESS |
| struct message_def *msg; |
| #endif |
| |
| const char method_name[] = "CProcess::CompleteRequest"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Process %s (%d,%d:%d), status %d\n", |
| method_name, __LINE__, Name, Nid, Pid, Verifier, status); |
| |
| #ifndef NAMESERVER_PROCESS |
| if ( !Clone ) |
| { |
| msg = parentContext(); |
| if ( msg ) |
| { // reply pending, so send reply |
| msg->noreply = false; |
| msg->u.reply.type = ReplyType_Generic; |
| msg->u.reply.u.generic.nid = Nid; |
| msg->u.reply.u.generic.pid = Pid; |
| msg->u.reply.u.generic.verifier = Verifier; |
| msg->u.reply.u.generic.process_name[0] = '\0'; |
| msg->u.reply.u.generic.return_code = status; |
| |
| CRequest::lioreply (msg, Pid); |
| parentContext( NULL ); |
| } |
| } |
| #endif |
| |
| TRACE_EXIT; |
| } |
| |
| bool CProcess::PickStdfile(PickStdFile_t whichStdfile, |
| char (&Destfile)[MAX_PROCESS_PATH], |
| int &ancestorNid, int &ancestorPid) |
| { |
| const char method_name[] = "CProcess::PickStdfile"; |
| TRACE_ENTRY; |
| |
| CLNode *node = NULL; |
| CProcess *ancestor; |
| int nextNid = -1; |
| int nextPid = 0; |
| |
| if (whichStdfile == PICK_STDOUT) |
| { |
| if (!outfile_.empty()) |
| { |
| STRCPY(Destfile, outfile_.c_str()); |
| TRACE_EXIT; |
| return true; |
| } |
| } |
| else |
| { |
| if (!infile_.empty()) |
| { |
| STRCPY(Destfile, infile_.c_str()); |
| TRACE_EXIT; |
| return true; |
| } |
| } |
| |
| nextNid = Parent_Nid; |
| nextPid = Parent_Pid; |
| Destfile[0] = '\0'; |
| bool retVal = true; |
| |
| // Keep track of process creation times to avoid looping forever. |
| struct timespec earlyCreationTime; |
| earlyCreationTime.tv_sec = CreationTime.tv_sec; |
| earlyCreationTime.tv_nsec = CreationTime.tv_nsec; |
| |
| while(true) |
| { |
| node = Nodes->GetLNode (nextNid); |
| if (node) |
| { |
| ancestor = node->GetProcessL(nextPid); |
| if ( ancestor && |
| (( ! MyNode->IsMyNode(ancestor->GetNid())) || |
| (ancestor->CreationTime.tv_sec < earlyCreationTime.tv_sec || |
| (ancestor->CreationTime.tv_sec == earlyCreationTime.tv_sec && |
| ancestor->CreationTime.tv_nsec < earlyCreationTime.tv_nsec))) ) |
| { |
| earlyCreationTime.tv_sec = ancestor->CreationTime.tv_sec; |
| earlyCreationTime.tv_nsec = ancestor->CreationTime.tv_nsec; |
| |
| if (whichStdfile == PICK_STDOUT && (ancestor->outfile())[0]) |
| { |
| // The ancestor specified a standard outfile |
| if ( MyNode->IsMyNode(nextNid) ) |
| { // The ancestor and this process are on the same node |
| STRCPY(Destfile, ancestor->outfile()); |
| } |
| else |
| { // The ancestor is on a different node. |
| ancestorNid = nextNid; |
| ancestorPid = nextPid; |
| } |
| |
| break; |
| } |
| else if (whichStdfile == PICK_STDIN && (ancestor->infile())[0]) |
| { |
| // The ancestor specified a standard outfile |
| if ( MyNode->IsMyNode(nextNid) ) |
| { // The ancestor and this process are on the same node |
| STRCPY(Destfile, ancestor->infile()); |
| } |
| else |
| { // The ancestor is on a different node. |
| ancestorNid = nextNid; |
| ancestorPid = nextPid; |
| } |
| |
| break; |
| } |
| else |
| { // The ancestor process did not specify a stdout file |
| // so next examine ancestor's parent. |
| if (Backup || ancestor->Backup) |
| { |
| nextNid = ancestor->PairParentNid; |
| nextPid = ancestor->PairParentPid; |
| } |
| else |
| { |
| nextNid = ancestor->Parent_Nid; |
| nextPid = ancestor->Parent_Pid; |
| } |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION)) |
| trace_printf("%s@%d could not find process object for " |
| "pid=%d\n", |
| method_name, __LINE__, nextPid); |
| retVal = false; |
| break; |
| } |
| } |
| else |
| { // Unexpectedly could not find node object |
| // log error |
| if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION)) |
| trace_printf("%s@%d could not find node object for nid=%d\n", |
| method_name, __LINE__, nextNid); |
| |
| if (nextNid != -1) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), |
| "%s, Unable to find node object for nid=%d\n", |
| method_name, nextNid); |
| mon_log_write(MON_PROCESS_PICKSTDFILE_2, SQ_LOG_ERR, buf); |
| } |
| retVal = false; |
| break; |
| } |
| } |
| |
| TRACE_EXIT; |
| |
| return retVal; |
| } |
| |
| // for attached processes, |
| // set CreationTime to last modification time of /proc/<pid>/cmdline |
| // for unattached process, |
| // set CreationTime to current time (fork time) |
| void CProcess::SetCreationTime(int os_pid) |
| { |
| if (os_pid == -1) |
| { |
| struct timespec ts; |
| int err = clock_gettime(CLOCK_REALTIME, &ts); |
| if (err == 0) |
| CreationTime = ts; |
| } else |
| { |
| char statline[40]; |
| struct stat statbuf; |
| snprintf(statline, sizeof(statline), "/proc/%d/cmdline", os_pid); |
| int err = stat(statline, &statbuf); |
| if (err == 0) |
| CreationTime = statbuf.st_mtim; |
| } |
| } |
| |
| void CProcess::SetVerifier() |
| { |
| Verifier = Monitor->incrGetVerifierNum(); |
| return; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::SetupFifo(int attachee_nid, int attachee_pid) |
| { |
| const char method_name[] = "CProcess::SetupFifo"; |
| TRACE_ENTRY; |
| |
| // reset umask (group needs write permissions for fifo) |
| mode_t prev_mask; |
| prev_mask = umask(S_IWOTH); |
| |
| |
| // Get the file name for the attached process's current standard in file |
| char std_name[MAX_PROCESS_PATH]; |
| char filepath[30]; |
| ssize_t std_name_len; |
| snprintf (filepath, sizeof(filepath), "/proc/%d/fd/0", attachee_pid); |
| std_name_len = readlink (filepath, std_name, MAX_PROCESS_PATH-1); |
| if (std_name_len < 0) std_name_len = 0; |
| std_name[std_name_len] = '\0'; |
| |
| if ((std_name_len >= 9) |
| && (strcmp(&std_name[std_name_len-9], "(deleted)") != 0)) |
| { |
| // Record the infile name in the process object |
| infile_ = std_name; |
| } |
| else if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION)) |
| trace_printf("%s@%d Not saving stdin file %s for pid=%d\n", |
| method_name, __LINE__, std_name, attachee_pid); |
| |
| // Get the file name for the attached process's current standard out file |
| snprintf (filepath, sizeof(filepath), "/proc/%d/fd/1", attachee_pid); |
| std_name_len = readlink (filepath, std_name, MAX_PROCESS_PATH-1); |
| if (std_name_len < 0) std_name_len = 0; |
| std_name[std_name_len] = '\0'; |
| |
| // Record the outfile name in the process object. Any child |
| // process created by it may write to the pipe. |
| if (strncmp(std_name, "pipe:", 5) != 0) |
| { // The attach process has a device or file for its standard output. |
| outfile_ = std_name; |
| } |
| else |
| { // The attached process has a pipe for its standard output. |
| outfile_ = filepath; |
| } |
| |
| // Create unique fifo name, store in process object |
| bool fifo_ok = true; |
| char fifo_stdout[50]; |
| strcpy(fifo_stdout, "/tmp/sqmp.XXXXXX"); |
| int fifo_stdout_fd = mkstemp(fifo_stdout); |
| if (fifo_stdout_fd == -1) |
| { // Unexpected mkstemp problem |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], mkstemp(%s) error, %s.\n", method_name, |
| fifo_stdout, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPFIFO_1, SQ_LOG_ERR, buf); |
| fifo_ok = false; |
| } |
| if (fifo_ok) |
| { |
| fifo_stdout_ = fifo_stdout; |
| // unlink so mkfifo works |
| int err = unlink(fifo_stdout); |
| if (err == -1) |
| { // Unexpected unlink problem |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], unlink(%s) error, %s.\n", method_name, |
| fifo_stdout, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPFIFO_2, SQ_LOG_ERR, buf); |
| fifo_ok = false; |
| } |
| } |
| if (fifo_ok) |
| { |
| if (mkfifo(fifo_stdout, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP)) |
| { // Unexpected fifo creation problem |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], mkfifo(%s) error, %s.\n", method_name, |
| fifo_stdout, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPFIFO_3, SQ_LOG_ERR, buf); |
| } |
| else |
| { |
| // Open the fifo for reading. Use non-blocking mode because |
| // otherwise open would not complete until attached process |
| // opens fifo for writing. |
| fd_stdout_ = open (fifo_stdout, O_RDONLY | O_NONBLOCK); |
| if (fd_stdout_ == -1) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), |
| "[%s], fifo open(%s) error, %s.\n", method_name, |
| fifo_stdout, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPFIFO_4, SQ_LOG_ERR, buf); |
| } |
| else |
| { |
| // close the unlinked file |
| close(fifo_stdout_fd); |
| } |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| Redirector.stdoutFd(attachee_nid, attachee_pid, fd_stdout_, outfile_.c_str(), |
| -1, -1); |
| #endif |
| } |
| |
| // Create unique stderr fifo name, store in process object |
| fifo_ok = true; |
| char fifo_stderr[50]; |
| strcpy(fifo_stderr, "/tmp/sqmp.XXXXXX"); |
| int fifo_stderr_fd = mkstemp(fifo_stderr); |
| if (fifo_stderr_fd == -1) |
| { // Unexpected mkstemp problem |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], mkstemp(%s) error, %s.\n", method_name, |
| fifo_stderr, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPFIFO_5, SQ_LOG_ERR, buf); |
| fifo_ok = false; |
| } |
| |
| if (fifo_ok) |
| { |
| fifo_stderr_ = fifo_stderr; |
| // unlink so mkfifo works |
| int err = unlink(fifo_stderr); |
| if (err == -1) |
| { // Unexpected unlink problem |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], unlink(%s) error, %s.\n", method_name, |
| fifo_stderr, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPFIFO_6, SQ_LOG_ERR, buf); |
| fifo_ok = false; |
| } |
| } |
| if (fifo_ok) |
| { |
| if (mkfifo(fifo_stderr, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP)) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), |
| "[%s], mkfifo(%s) error, %s.\n", method_name, fifo_stderr, |
| strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPFIFO_7, SQ_LOG_ERR, buf); |
| } |
| else |
| { |
| fd_stderr_ = open (fifo_stderr, O_RDONLY | O_NONBLOCK); |
| if (fd_stderr_ == -1) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), |
| "[%s], fifo open(%s) error, %s.\n", method_name, |
| fifo_stderr, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPFIFO_8, SQ_LOG_ERR, buf); |
| } |
| else |
| { |
| // close the unlinked file |
| close(fifo_stderr_fd); |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| Redirector.stderrFd(MyNode->GetHostname(), Name, Nid, attachee_pid, fd_stderr_); |
| #endif |
| } |
| } |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION)) |
| trace_printf("%s@%d Process=%s, Pid=%d, Infile=[%s], " |
| "Outfile=[%s], fifo_stdout=%s, fd_stdout=%d, " |
| "fifo_stderr=%s, fd_stderr=%d\n", |
| method_name, __LINE__, Name, attachee_pid, |
| infile_.c_str(), outfile_.c_str(), fifo_stdout_.c_str(), |
| fd_stdout_, fifo_stderr_.c_str(), fd_stderr_); |
| |
| // Restore previous umask |
| umask(prev_mask); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| // LCOV_EXCL_START |
| // Methods CProcess::SetupPipe and CProcess::RedirectStdFiles are |
| // excluded from code coverage measurement. They are executed only |
| // by a monitor child process not the monitor process itself. Therefore |
| // they do not show up as covered lines when monitor code coverage measurement |
| // is done. |
| |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::SetupPipe(int orig_fd, int unused_pipe_fd, int pipe_fd) |
| { |
| int newfd; |
| char buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CProcess::SetupPipe"; |
| TRACE_ENTRY; |
| |
| // Close original file descriptor |
| if (close(orig_fd)) |
| { |
| snprintf(buf, sizeof(buf), "[%s], close(%d) error, %s.\n", |
| method_name, orig_fd, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPPIPE_1, SQ_LOG_ERR, buf); |
| } |
| |
| // Close unused pipe file descriptor |
| if (close(unused_pipe_fd)) |
| { |
| snprintf(buf, sizeof(buf), "[%s], close(%d) error, %s.\n", method_name, |
| unused_pipe_fd, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPPIPE_2, SQ_LOG_ERR, buf); |
| } |
| |
| // Duplicate pipe file desciptor to original file descriptor number |
| newfd = dup2(pipe_fd, orig_fd); |
| if (newfd == -1) |
| { |
| snprintf(buf, sizeof(buf), "[%s], dup2(%d, %d) error, %s.\n", |
| method_name, pipe_fd, orig_fd, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPPIPE_3, SQ_LOG_ERR, buf); |
| } |
| |
| // Close the pipe file descriptor |
| if (close(pipe_fd)) |
| { |
| snprintf(buf, sizeof(buf), "[%s], close(%d) error, %s.\n", method_name, |
| pipe_fd, strerror(errno)); |
| mon_log_write(MON_PROCESS_SETUPPIPE_4, SQ_LOG_ERR, buf); |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::RedirectStdFiles(int pfds_stdin[2], int pfds_stdout[2], |
| int pfds_stderr[2]) |
| |
| { |
| const char method_name[] = "CProcess::RedirectStdFiles"; |
| TRACE_ENTRY; |
| |
| SetupPipe(0, pfds_stdin[1], pfds_stdin[0]); |
| |
| SetupPipe(1, pfds_stdout[0], pfds_stdout[1]); |
| |
| SetupPipe(2, pfds_stderr[0], pfds_stderr[1]); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| // LCOV_EXCL_STOP |
| |
| void CProcess::setEnvStr ( char **envp, int &countEnv, const char *str ) |
| { |
| envp[countEnv] = new char [ strlen(str)+1 ]; |
| strcpy ( envp[countEnv], str ); |
| ++countEnv; |
| } |
| |
| void CProcess::setEnvStrVal ( char **envp, int &countEnv, const char *str, |
| const char *val) |
| { |
| envp[countEnv] = new char [ strlen(str)+strlen(val)+2 ]; |
| sprintf ( envp[countEnv], "%s=%s", str, val ); |
| ++countEnv; |
| } |
| |
| void CProcess::setEnvIntVal ( char **envp, int &countEnv, const char *str, |
| int val) |
| { |
| envp[countEnv] = new char [ strlen(str)+13 ]; |
| sprintf ( envp[countEnv], "%s=%d", str, val ); |
| ++countEnv; |
| } |
| |
| void CProcess::setEnvRegGroupVals(CConfigGroup *group, char **envp, |
| int &countEnv) |
| { |
| CConfigKey *key; |
| |
| const char method_name[] = "CProcess::setEnvRegGroupVals"; |
| TRACE_ENTRY; |
| |
| if (group) |
| { |
| key = group->GetKey((char *) ""); |
| while (key) |
| { |
| if (strncasecmp(key->GetName(), "~US_", 4) != 0) |
| { // Not an internal monitor unique string, ok to set |
| setEnvStrVal(envp, countEnv, key->GetName(), key->GetValue()); |
| } |
| if (countEnv >= MAX_CHILD_ENV_VARS) |
| { |
| break; |
| } |
| |
| key = key->GetNext(); |
| } |
| } |
| TRACE_EXIT; |
| } |
| |
| void CProcess::setEnvFromRegistry ( char **envp, int &countEnv ) |
| { |
| CConfigGroup *group; |
| |
| group = Config->GetClusterGroup(); |
| setEnvRegGroupVals ( group, envp, countEnv ); |
| |
| group = Config->GetLocalNodeGroup(); |
| setEnvRegGroupVals ( group, envp, countEnv ); |
| |
| group = Config->GetGroup(Name); |
| setEnvRegGroupVals ( group, envp, countEnv ); |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcess::Create (CProcess *parent, void* tag, int & result) |
| { |
| bool monAltLogEnabled = false; |
| bool seamonsterEnabled = false; |
| bool shellTrace = false; |
| bool successful = false; |
| bool wdtDumpMonitor = false; |
| bool wdtTraceCmd = false; |
| bool wdtTraceInit = false; |
| bool wdtTraceLio = false; |
| bool wdtTraceEntryExit = false; |
| bool wdtKeepAliveTimer = false; |
| bool wdtMonProcRate = false; |
| bool wdtLunmgrHangDelay = false; |
| bool wdtLinuxWatchdog = false; |
| bool wdtStartupTimer = false; |
| int numProcessThreads = 0; |
| int keepAliveValue = 0; |
| int monitorCheckRateValue = 0; |
| int lunmgrHangDelayValue = 0; |
| int startupTimerValue = 0; |
| int i; |
| int j; |
| int rc = -1; |
| int rc2 = -1; |
| char *env; |
| char **argv; |
| char *childEnv[MAX_CHILD_ENV_VARS + 1]; |
| int nextEnv = 0; |
| int maxClientBuffers = SQ_LIO_MAX_BUFFERS; |
| |
| char la_buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CProcess::Create"; |
| TRACE_ENTRY; |
| |
| result = MPI_SUCCESS; |
| |
| if (Debug) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - Starting process through debugger" "\n", method_name, __LINE__); |
| } |
| |
| pid_t os_pid; |
| char sonardir[MAX_PROCESS_PATH]; |
| char user[50]; |
| char filename[MAX_PROCESS_PATH]; |
| char home[MAX_PROCESS_PATH]; |
| char mpiroot[MAX_PROCESS_PATH]; |
| char mpitmpdir[MAX_PROCESS_PATH]; |
| char mpiflags[20]; |
| char mpi_ic_order[10]; |
| char mpi_test_delay[10]; |
| char mpi_error_level[10]; |
| char sq_ic[5]; |
| char term[20]; |
| char tz[100]; |
| bool tz_exists; |
| char xauthority[MAX_PROCESS_PATH]; |
| char *display; |
| char *vnodes; |
| char nsCommPort[10]; |
| char nsSyncPort[10]; |
| char nsMon2NsPort[10]; |
| char nsConfigDb[MAX_PROCESS_PATH]; |
| MON_Props xprops(true); |
| MON_Props xprops_exe(true); |
| char *xprops_exe_file; |
| |
| // get last used default environment |
| env = getenv ("TERM"); |
| STRCPY (term, (env?env:"ansi")); |
| env = getenv ("TZ"); |
| tz_exists = (env != NULL); |
| if (tz_exists) |
| { |
| STRCPY (tz, env); // see note regarding TZ below |
| } |
| env = getenv ("USER"); |
| STRCPY (user, (env?env:"")); |
| env = getenv ("HOME"); |
| STRCPY (home, (env?env:"")); |
| env = getenv ("SONAR_ROOT"); |
| STRCPY (sonardir, (env?env:"")); |
| env = getenv ("MPI_ROOT"); |
| STRCPY (mpiroot, (env?env:"")); |
| env = getenv ("MPI_TMPDIR"); |
| STRCPY (mpitmpdir, (env?env:home)); |
| // strcpy (mpiflags, "l,y0,Eon"); |
| strcpy (mpiflags, "y0"); |
| if (Debug) |
| { |
| strcat(mpiflags,",egdb"); |
| } |
| env = getenv ("MPI_TEST_DELAY"); |
| STRCPY(mpi_test_delay,(env?env:"2")); |
| env = getenv ("MPI_ERROR_LEVEL"); |
| strcpy(mpi_error_level,(env?env:"2")); |
| STRCPY (xauthority, home); |
| strcat (xauthority, "/.Xauthority"); |
| display = getenv ("DISPLAY"); |
| vnodes = getenv("SQ_VIRTUAL_NODES"); |
| env=getenv("SQ_IC"); |
| if(env) |
| { |
| if ((strcmp(env,"IBV")==0) || (strcmp(env,"-IBV")==0)) |
| { |
| strcpy(sq_ic, "-IBV"); |
| strcpy(mpi_ic_order, "IBV"); |
| } |
| else |
| { |
| strcpy(sq_ic, "-TCP"); |
| strcpy(mpi_ic_order, "TCP"); |
| } |
| } |
| else |
| { |
| strcpy(sq_ic, "-TCP"); |
| strcpy(mpi_ic_order, "TCP"); |
| } |
| |
| env = getenv( "SQ_LIO_MAX_BUFFERS" ); |
| if (env) |
| { |
| maxClientBuffers = atoi( env ); |
| } |
| |
| env = getenv( "SQ_LOCAL_IO_SHELL_TRACE" ); |
| if (env && strcmp( env, "1" ) == 0) |
| shellTrace = true; |
| |
| if ( Type == ProcessType_NameServer ) |
| { |
| env = getenv ("NS_COMM_PORT"); |
| STRCPY (nsCommPort, (env?env:"")); |
| env = getenv ("NS_SYNC_PORT"); |
| STRCPY (nsSyncPort, (env?env:"")); |
| env = getenv ("NS_M2N_COMM_PORT"); |
| STRCPY (nsMon2NsPort, (env?env:"")); |
| env = getenv ("SQ_CONFIGDB"); |
| STRCPY (nsConfigDb, (env?env:"")); |
| } |
| if ( Type == ProcessType_Watchdog ) |
| { |
| env = getenv( "WDT_TRACE_CMD" ); |
| if (env && strcmp( env, "1" ) == 0) |
| wdtTraceCmd = true; |
| env = getenv( "WDT_TRACE_INIT" ); |
| if (env && strcmp( env, "1" ) == 0) |
| wdtTraceInit = true; |
| env = getenv( "WDT_TRACE_LIO" ); |
| if (env && strcmp( env, "1" ) == 0) |
| wdtTraceLio = true; |
| env = getenv( "WDT_TRACE_ENTRY_EXIT" ); |
| if (env && strcmp( env, "1" ) == 0) |
| wdtTraceEntryExit = true; |
| env = getenv( "SQ_WDT_KEEPALIVETIMERVALUE" ); |
| if (env && isdigit(*env)) |
| { |
| wdtKeepAliveTimer = true; |
| keepAliveValue = atoi(env); |
| } |
| env = getenv( "SQ_WDT_MONITOR_PROCESS_CHECKRATE" ); |
| if (env && isdigit(*env)) |
| { |
| wdtMonProcRate = true; |
| monitorCheckRateValue = atoi(env); |
| } |
| env = getenv( "SQ_WDT_LUNMGR_PROCESS_HANGDELAY" ); |
| if (env && isdigit(*env)) |
| { |
| wdtLunmgrHangDelay = true; |
| lunmgrHangDelayValue = atoi(env); |
| } |
| env = getenv( "SQ_LINUX_WATCHDOG" ); |
| if (env && strcmp( env, "1" ) == 0) |
| wdtLinuxWatchdog = true; |
| env = getenv( "SQ_WDT_STARTUPTIMERVALUE" ); |
| if (env && isdigit(*env)) |
| { |
| wdtStartupTimer = true; |
| startupTimerValue = atoi(env); |
| } |
| env = getenv( "SQ_WDT_DUMP_MONITOR" ); |
| if (env && strcmp( env, "1" ) == 0) |
| wdtDumpMonitor = true; |
| } |
| |
| env = getenv( "SQ_MON_ALTLOG" ); |
| if (env && strcmp( env, "1" ) == 0) |
| monAltLogEnabled = true; |
| |
| env = getenv( "SQ_SEAMONSTER" ); |
| if (env && strcmp( env, "1" ) == 0) |
| seamonsterEnabled = true; |
| |
| env = getenv( "SQ_LIO_PROCESS_THREADS" ); |
| if (env && isdigit(*env)) |
| numProcessThreads = atoi(env); |
| |
| env = getenv( "TRAF_CONF" ); |
| if (env) |
| { |
| trafConf_ = env ; |
| } |
| env = getenv( "TRAF_HOME" ); |
| if (env) |
| { |
| trafHome_ = env ; |
| } |
| env = getenv( "TRAF_VAR" ); |
| if (env) |
| { |
| trafVar_ = env ; |
| } |
| |
| // setup default environment variables from monitor or last CreateProcess call |
| if (maxClientBuffers) |
| { |
| setEnvIntVal ( childEnv, nextEnv, "SQ_LIO_MAX_BUFFERS", maxClientBuffers ); |
| } |
| if (numProcessThreads) |
| { |
| setEnvIntVal ( childEnv, nextEnv, "SQ_LIO_PROCESS_THREADS", |
| numProcessThreads ); |
| } |
| if (shellTrace) |
| { |
| setEnvStr ( childEnv, nextEnv, "SQ_LOCAL_IO_SHELL_TRACE=1" ); |
| } |
| |
| setEnvStrVal ( childEnv, nextEnv, "MPI_ROOT", mpiroot ); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d - MPI_ROOT = %s\n", method_name, __LINE__, mpiroot); |
| |
| setEnvStrVal ( childEnv, nextEnv, "MPI_TMPDIR", mpitmpdir ); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d - MPI_TMPDIR=%s\n", method_name, __LINE__, |
| mpitmpdir); |
| |
| setEnvStrVal ( childEnv, nextEnv, "MPI_FLAGS", mpiflags ); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d - MPI_FLAGS=%s\n", method_name, __LINE__, mpiflags); |
| |
| setEnvStrVal ( childEnv, nextEnv, "MPI_IC_ORDER", mpi_ic_order ); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d - MPI_IC_ORDER=%s\n", method_name, __LINE__, |
| mpi_ic_order); |
| |
| setEnvStrVal ( childEnv, nextEnv, "MPI_TEST_DELAY", mpi_test_delay ); |
| |
| setEnvStrVal ( childEnv, nextEnv, "MPI_ERROR_LEVEL", mpi_error_level ); |
| |
| setEnvStr ( childEnv, nextEnv, "MPI_RDMA_MSGSIZE=32768,131072,4194304" ); |
| |
| setEnvStr ( childEnv, nextEnv, "HPMP_SQ=1" ); |
| |
| setEnvStr ( childEnv, nextEnv, "MALLOC_ARENA_MAX=1" ); |
| |
| setEnvStr ( childEnv, nextEnv, "HPMP_SINGLETON_HA=1" ); |
| |
| if ( strcmp( mpi_ic_order, "IBV" ) == 0 ) |
| { |
| setEnvStr ( childEnv, nextEnv, "MPI_HASIC_IBV=1" ); |
| } |
| |
| if ( Measure == 1 ) |
| { |
| snprintf(filename,sizeof(filename),"%s/%s", mpitmpdir, Name); |
| setEnvStrVal ( childEnv, nextEnv, "MPI_INSTR", filename ); |
| } |
| else if ( Measure == 2 ) |
| { |
| snprintf(filename,sizeof(filename),"%s/%s.cpu:cpu", mpitmpdir, Name); |
| setEnvStrVal ( childEnv, nextEnv, "MPI_INSTR", filename ); |
| } |
| |
| setEnvStrVal ( childEnv, nextEnv, "TRAF_CONF", trafConf_.c_str() ); |
| setEnvStrVal ( childEnv, nextEnv, "TRAF_HOME", trafHome_.c_str() ); |
| setEnvStrVal ( childEnv, nextEnv, "TRAF_VAR", trafVar_.c_str() ); |
| setEnvStrVal ( childEnv, nextEnv, "USER", user ); |
| setEnvStrVal ( childEnv, nextEnv, "HOME", home ); |
| setEnvStrVal ( childEnv, nextEnv, "TERM", term ); |
| if (tz_exists) |
| { |
| // Note that if TZ does not exist, we don't want to set it. |
| // The absence of TZ causes the glib localtime function to |
| // use the local time as defined in /etc/localtime. But, |
| // an invalid TZ setting (such as the empty string) causes |
| // the localtime function to use UTC. So, the semantics of |
| // an unset TZ are not the same as the semantics of |
| // TZ=<empty string>. |
| setEnvStrVal ( childEnv, nextEnv, "TZ", tz ); |
| } |
| setEnvStrVal ( childEnv, nextEnv, "CLASSPATH", getenv("CLASSPATH")); |
| |
| if ( display ) |
| { |
| setEnvStrVal ( childEnv, nextEnv, "DISPLAY", display ); |
| } |
| setEnvStrVal ( childEnv, nextEnv, "XAUTHORITY", xauthority ); |
| setEnvStrVal ( childEnv, nextEnv, "SQ_IC", sq_ic ); |
| if ( vnodes && *vnodes ) |
| { |
| setEnvStrVal ( childEnv, nextEnv, "SQ_VIRTUAL_NODES", vnodes ); |
| setEnvIntVal ( childEnv, nextEnv, "SQ_VIRTUAL_NID", MyPNID ); |
| setEnvIntVal ( childEnv, nextEnv, "SQ_LIO_VIRTUAL_NID", MyPNID ); |
| } |
| |
| if ( Type == ProcessType_NameServer ) |
| { |
| setEnvStr ( childEnv, nextEnv, "SQ_MON_CREATOR=MPIRUN" ); |
| setEnvStr ( childEnv, nextEnv, "SQ_MON_RUN_MODE=AGENT" ); |
| if ( nsCommPort[0] ) |
| setEnvStrVal ( childEnv, nextEnv, "NS_COMM_PORT", nsCommPort ); |
| if ( nsSyncPort[0] ) |
| setEnvStrVal ( childEnv, nextEnv, "NS_SYNC_PORT", nsSyncPort ); |
| if ( nsMon2NsPort[0] ) |
| setEnvStrVal ( childEnv, nextEnv, "NS_M2N_COMM_PORT", nsMon2NsPort ); |
| if (nsConfigDb[0] ) |
| setEnvStrVal ( childEnv, nextEnv, "SQ_CONFIGDB", nsConfigDb ); |
| } |
| if ( Type == ProcessType_Watchdog ) |
| { |
| if ( wdtTraceCmd ) |
| { |
| setEnvStr ( childEnv, nextEnv, "WDT_TRACE_CMD=1" ); |
| } |
| if ( wdtTraceInit ) |
| { |
| setEnvStr ( childEnv, nextEnv, "WDT_TRACE_INIT=1" ); |
| } |
| if ( wdtTraceLio ) |
| { |
| setEnvStr ( childEnv, nextEnv, "WDT_TRACE_LIO=1" ); |
| } |
| if ( wdtTraceEntryExit ) |
| { |
| setEnvStr ( childEnv, nextEnv, "WDT_TRACE_ENTRY_EXIT=1" ); |
| } |
| if ( wdtKeepAliveTimer ) |
| { |
| setEnvIntVal ( childEnv, nextEnv, "SQ_WDT_KEEPALIVETIMERVALUE", keepAliveValue ); |
| } |
| if ( wdtMonProcRate ) |
| { |
| setEnvIntVal ( childEnv, nextEnv, "SQ_WDT_MONITOR_PROCESS_CHECKRATE", monitorCheckRateValue ); |
| } |
| if ( wdtLunmgrHangDelay ) |
| { |
| setEnvIntVal ( childEnv, nextEnv, "SQ_WDT_LUNMGR_PROCESS_HANGDELAY", lunmgrHangDelayValue ); |
| } |
| if ( wdtLinuxWatchdog ) |
| { |
| setEnvStr ( childEnv, nextEnv, "SQ_LINUX_WATCHDOG=1" ); |
| } |
| if ( wdtStartupTimer ) |
| { |
| setEnvIntVal ( childEnv, nextEnv, "SQ_WDT_STARTUPTIMERVALUE", startupTimerValue ); |
| } |
| if ( wdtDumpMonitor ) |
| { |
| setEnvStr ( childEnv, nextEnv, "SQ_WDT_DUMP_MONITOR=1" ); |
| } |
| if ( monAltLogEnabled ) |
| { |
| setEnvStr ( childEnv, nextEnv, "SQ_MON_ALTLOG=1" ); |
| } |
| } |
| if ( Type == ProcessType_PSD || Type == ProcessType_SMS ) |
| { |
| if ( monAltLogEnabled ) |
| { |
| setEnvStr ( childEnv, nextEnv, "SQ_MON_ALTLOG=1" ); |
| } |
| } |
| if ( seamonsterEnabled ) |
| { |
| setEnvStr ( childEnv, nextEnv, "SQ_SEAMONSTER=1" ); |
| } |
| |
| string LDpath; |
| static bool sv_getenv_ld_library_path_done = false; |
| static string sv_ld_library_path; |
| if (IsAgentMode) |
| { |
| if (! sv_getenv_ld_library_path_done) |
| { |
| sv_getenv_ld_library_path_done = true; |
| sv_ld_library_path = getenv( "LD_LIBRARY_PATH" ); |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d" " - LD_LIBRARY_PATH = " "%s" "\n", method_name, __LINE__, sv_ld_library_path.c_str() ); |
| } |
| } |
| LDpath = sv_ld_library_path; |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d" " - LD_LIBRARY_PATH = " "%s" "\n", method_name, __LINE__, LDpath.c_str() ); |
| } |
| } |
| else |
| { |
| if (ldpathStrId_.nid != -1) |
| { |
| Config->strIdToString( ldpathStrId_, LDpath ); |
| } |
| } |
| if (!LDpath.empty()) |
| { |
| setEnvStrVal( childEnv, nextEnv, "LD_LIBRARY_PATH", LDpath.c_str( ) ); |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d - LD_LIBRARY_PATH = %s\n", method_name, __LINE__, LDpath.c_str() ); |
| } |
| } |
| |
| setEnvStr ( childEnv, nextEnv, "LD_BIND_NOW=true" ); |
| |
| string program; |
| Config->strIdToString ( programStrId_, program ); |
| // temp for performance investigation |
| if ( strstr(program.c_str(), "tdm_arkcmp") != NULL |
| || strstr(program.c_str(), "tdm_arkesp") != NULL ) |
| { |
| cmpOrEsp_ = true; |
| } |
| // Save actual program filename and set PWD environment variable |
| size_t lastSlash = program.rfind('/'); |
| if (lastSlash == string::npos) |
| { // At top level directory |
| STRCPY(filename, program.c_str()); |
| } |
| else |
| { |
| STRCPY(filename, &program[lastSlash+1]); |
| } |
| if (lastSlash == string::npos || lastSlash == 0) |
| { |
| setEnvStr ( childEnv, nextEnv, "PWD=/" ); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - PWD=/\n", method_name, __LINE__); |
| } |
| else |
| { |
| string pwd = program.substr(0, lastSlash); |
| setEnvStrVal ( childEnv, nextEnv, "PWD", pwd.c_str() ); |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - PWD=%s\n", method_name, __LINE__, |
| pwd.c_str()); |
| } |
| |
| string path; |
| static bool sv_getenv_path_done = false; |
| static string sv_path; |
| if (IsAgentMode) |
| { |
| if (! sv_getenv_path_done) |
| { |
| sv_getenv_path_done = true; |
| sv_path = getenv( "PATH" ); |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, sv_path.c_str() ); |
| } |
| } |
| path = sv_path; |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str() ); |
| } |
| } |
| else |
| { |
| if (pathStrId_.nid != -1) |
| { |
| Config->strIdToString( pathStrId_, path ); |
| } |
| } |
| setEnvStrVal( childEnv, nextEnv, "PATH", path.c_str( ) ); |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str() ); |
| } |
| |
| // Set values from registry as environment variables |
| setEnvFromRegistry ( childEnv, nextEnv ); |
| |
| xprops_exe_file = NULL; |
| xprops.load("mon.env"); |
| MON_Smap_Enum xenum(&xprops); |
| if (xenum.more()) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[CProcess::Create], Warning: using mon.env.\n"); |
| } |
| while (xenum.more()) |
| { |
| char *xkey = xenum.next(); |
| const char *xvalue = xprops.get(xkey); |
| if (memcmp(xkey, "SQ_PROPS_", 9) == 0) |
| { |
| if (strcasecmp(&xkey[9], filename) == 0) |
| xprops_exe_file = (char *) xvalue; |
| } |
| setEnvStrVal ( childEnv, nextEnv, xkey, xvalue ); |
| if (nextEnv > MAX_CHILD_ENV_VARS) |
| { // Exceeded array size |
| nextEnv = MAX_CHILD_ENV_VARS; |
| break; |
| } |
| if (trace_settings |
| & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d - mon.env %s=%s\n", method_name, __LINE__, xkey, |
| xvalue ); |
| } |
| if (xprops_exe_file != NULL) |
| { |
| // load exe-property-file |
| xprops_exe.load(xprops_exe_file); |
| MON_Smap_Enum xenum(&xprops_exe); |
| while (xenum.more()) |
| { |
| char *xkey = xenum.next(); |
| const char *xvalue = xprops_exe.get(xkey); |
| setEnvStrVal ( childEnv, nextEnv, xkey, xvalue ); |
| if (nextEnv > MAX_CHILD_ENV_VARS) |
| { // Exceeded array size |
| nextEnv = MAX_CHILD_ENV_VARS; |
| break; |
| } |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d - %s %s=%s\n", method_name, __LINE__, xprops_exe_file, xkey, xvalue); |
| } |
| } |
| // Add environment array terminator required by execve. |
| childEnv[nextEnv] = NULL; |
| ++nextEnv; |
| |
| if ( !SMSIntegrating && Type == ProcessType_SMS && !Clone && !argc_ ) |
| { |
| argv = new char *[13]; |
| } |
| else |
| { |
| argv = new char *[argc_ + 13]; |
| } |
| argv[0] = new char [strlen(filename)+1]; |
| strcpy(argv[0], filename); |
| j = 1; |
| |
| // finish setting up arguments for process after <filename> in argv[0] |
| // "SQMON1.0" <pnid> <nid> <pid> <pname> <port> <ptype> <zid> <verifier> "SPARE" |
| // [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] |
| argv[j] = new char[9]; |
| sprintf (argv[j], "SQMON1.1"); |
| |
| argv[j + 1] = new char[6]; |
| sprintf (argv[j + 1], "%5.5d", MyPNID); |
| |
| argv[j + 2] = new char[6]; |
| sprintf (argv[j + 2], "%5.5d", Nid); |
| |
| argv[j + 3] = new char[7]; |
| //sprintf (argv[j + 3], "%6.6d", Pid); |
| strcpy(argv[j + 3],"??????"); // The Pid will be assigned later, but we can't print it then. |
| |
| argv[j + 4] = new char[strlen(Name) ? strlen(Name)+1 : MAX_PROCESS_NAME_STR]; |
| strcpy (argv[j + 4], Name); |
| |
| argv[j + 5] = new char[strlen (MyCommPort) + 1]; |
| strcpy (argv[j + 5], MyCommPort); |
| |
| argv[j + 6] = new char[6]; |
| sprintf (argv[j + 6], "%5.5d", Type); |
| |
| argv[j + 7] = new char[6]; |
| sprintf (argv[j + 7], "%5.5d", MyNode->GetZone()); |
| |
| SetVerifier(); // CProcess::Create |
| argv[j + 8] = new char[6]; |
| sprintf (argv[j + 8], "%5.5d", Verifier); |
| |
| argv[j + 9] = new char[6]; |
| sprintf (argv[j + 9], "SPARE"); |
| |
| if ( !SMSIntegrating && Type == ProcessType_SMS && !Clone && !argc_ ) |
| { |
| argc_ = 1; |
| argv[j + 10] = new char[7]; |
| sprintf (argv[j + 10], "sminit"); |
| argv[j + 11] = NULL; |
| } |
| else |
| { |
| // now append user args |
| const char *pUserArgv = userArgv_; |
| int arglen; |
| for (i = 0; i < argc_; i++) |
| { |
| arglen = strlen (pUserArgv) + 1; |
| argv[i + j + 10] = new char[arglen]; |
| strcpy (argv[i + j + 10], pUserArgv); |
| pUserArgv += arglen; |
| } |
| argv[i + j + 10] = NULL; |
| } |
| |
| // start process and place in list |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| { |
| trace_printf("%s@%d" " - Program='" "%s" "' argc=" "%d" "\n", method_name, __LINE__, program.c_str(), argc_ + j + 10); |
| i = 0; |
| while (argv[i] != NULL) |
| { |
| trace_printf("%s@%d" " - argv[" "%d" "]=" "%s" "\n", method_name, __LINE__, i, argv[i]); |
| i++; |
| } |
| } |
| |
| // Create pipes for inter-process communication between new process |
| // and the monitor. |
| int pfds_stdin[2]; |
| if (pipe (pfds_stdin)) |
| { // Error creating pipe |
| snprintf(la_buf, sizeof(la_buf), "[%s], stdin pipe error, %s.\n", |
| method_name, strerror(errno)); |
| mon_log_write(MON_PROCESS_CREATE_1, SQ_LOG_ERR, la_buf); |
| pfds_stdin[0] = -1; |
| pfds_stdin[1] = -1; |
| } |
| |
| int pfds_stdout[2]; |
| if (pipe (pfds_stdout)) |
| { // Error creating pipe |
| snprintf(la_buf, sizeof(la_buf), "[%s], stdout pipe error, %s.\n", |
| method_name, strerror(errno)); |
| mon_log_write(MON_PROCESS_CREATE_2, SQ_LOG_ERR, la_buf); |
| pfds_stdout[0] = -1; |
| pfds_stdout[1] = -1; |
| } |
| |
| int pfds_stderr[2]; |
| if (pipe (pfds_stderr)) |
| { // Error creating pipe |
| snprintf(la_buf, sizeof(la_buf), "[%s], stderr pipe error, %s.\n", |
| method_name, strerror(errno)); |
| mon_log_write(MON_PROCESS_CREATE_3, SQ_LOG_ERR, la_buf); |
| pfds_stderr[0] = -1; |
| pfds_stderr[1] = -1; |
| } |
| |
| MemModLock.lock(); |
| |
| // make all child variable accessed only from heap |
| int priority = Priority; |
| |
| #ifdef USE_FORK_SUSPEND_RESUME |
| mon_thread_suspend_all(); |
| #endif // USE_FORK_SUSPEND_RESUME |
| |
| sigset_t forkMask; |
| sigset_t oldMask; |
| sigemptyset(&forkMask); |
| sigaddset(&forkMask, SIGCHLD); |
| rc = pthread_sigmask(SIG_BLOCK, &forkMask, &oldMask); |
| if (rc != 0) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], pthread_sigmask() error: %s (%d)\n", |
| method_name, strerror(rc), rc ); |
| mon_log_write(MON_PROCESS_CREATE_4, SQ_LOG_ERR, la_buf); |
| } |
| |
| // this pipe is used to tell the child to go away if monitor detects |
| // a duplicate pid. This can occur if there is a pending child death signal. |
| int pipefd[2]; |
| pipe(pipefd); |
| bool childGoAway = false; |
| |
| SetCreationTime(-1); |
| os_pid = fork (); |
| if (os_pid == -1) |
| { |
| // can't start a process |
| rc = result = MPI_ERR_SPAWN; |
| } |
| else if (os_pid) |
| { |
| // I am monitor |
| |
| rc = pthread_sigmask(SIG_SETMASK, &oldMask, NULL); |
| if (rc != 0) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], pthread_sigmask() error: %s (%d)\n", |
| method_name, strerror(rc), rc ); |
| mon_log_write(MON_PROCESS_CREATE_5, SQ_LOG_ERR, la_buf); |
| } |
| |
| // check if process already exists with the same pid. |
| if (MyNode->GetProcess(os_pid) != NULL) |
| { |
| rc = result = MPI_ERR_SPAWN; |
| // tell the child to go away |
| childGoAway = true; |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], pid already exists, aborting process create: pid = %d\n", |
| method_name, os_pid ); |
| mon_log_write(MON_PROCESS_CREATE_4, SQ_LOG_ERR, la_buf); |
| } |
| |
| // tell the child to stay or go away |
| close(pipefd[0]); // close the read-end of the pipe, not going to use |
| write(pipefd[1], &childGoAway, sizeof(childGoAway)); |
| close(pipefd[1]); // close the write-end of the pipe, sending EOF. |
| |
| if (childGoAway) |
| { // no need to continue connecting with child |
| goto forkExit; |
| } |
| |
| // I'm the monitor ... connect to child |
| rc = MPI_SUCCESS; |
| |
| // Save process id and build process name if not already named |
| Pid = os_pid; |
| if (Name[0] == '\0') |
| { // No name assigned to the process so generate one based on |
| // the node-id and process-id. |
| MyNode->BuildOurName(Nid, os_pid, Name); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - No process name specified, generated name=%s for process (%d, %d)\n", method_name, __LINE__, Name, Nid, os_pid); |
| } |
| |
| if (NameServerEnabled && tag != NULL) |
| { |
| // Send actual pid and process name back to parent |
| // STDIO Redirection requires that clone process in parent node |
| // have the actual pid |
| rc2 = PtpClient->ProcessInit( this |
| , tag |
| , 0 |
| , parent->Nid ); |
| if (rc2) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| CLNode *parentLNode = NULL; |
| parentLNode = Nodes->GetLNode( parent->Nid ); |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process create success " |
| "for process %s (%d, %d) " |
| "to parent node %s, nid=%d\n" |
| , method_name |
| , GetName() |
| , GetNid() |
| , GetPid() |
| , parentLNode->GetNode()->GetName() |
| , parentLNode->GetNid() ); |
| mon_log_write(MON_PROCESS_CREATE_12, SQ_LOG_ERR, la_buf); |
| } |
| } |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION)) |
| trace_printf("%s@%d Process=%s, Infile=[%s], Outfile=[%s]\n", |
| method_name, __LINE__, Name, infile_.c_str(), |
| outfile_.c_str()); |
| |
| // stdin pipe to child: |
| // We don't need read end of pipe. |
| // Add the write end of file descriptor to list of file |
| // descriptors monitored. |
| if (pfds_stdin[1] != -1) |
| { |
| close(pfds_stdin[0]); |
| |
| // Decide on standard input source for the |
| // process. It will either be a filename on this node |
| // or handled by a specific process on another node. |
| int AncestorNid = -1; |
| int AncestorPid = -1; |
| char Stdfile[MAX_PROCESS_PATH]; |
| if (PickStdfile(PICK_STDIN, Stdfile, AncestorNid, AncestorPid)) |
| { |
| Redirector.stdinFd(Nid, os_pid, pfds_stdin[1], Stdfile, |
| AncestorNid, AncestorPid); |
| fd_stdin_ = pfds_stdin[1]; |
| } |
| else |
| { |
| if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION)) |
| trace_printf("%s@%d Unable to find stdin file for " |
| "Process=%s, pid=%d. Closing stdin pipe " |
| "fd=%d\n", method_name, __LINE__, Name, |
| os_pid, pfds_stdin[1]); |
| close ( pfds_stdin[1] ); |
| } |
| } |
| |
| // stdout pipe to child: |
| // We don't need write end of pipe. |
| // Add the read end of file descriptor to list of file |
| // descriptors monitored. |
| if (pfds_stdout[0] != -1) |
| { |
| close(pfds_stdout[1]); |
| |
| // Decide on standard output destination for the |
| // process. It will either be a filename on this node |
| // or handled by a specific process on another node. |
| int AncestorNid = -1; |
| int AncestorPid = -1; |
| char Stdfile[MAX_PROCESS_PATH]; |
| if (!PickStdfile(PICK_STDOUT, Stdfile, AncestorNid, AncestorPid)) |
| { // Unable to locate stdout file. So create a file based |
| // on the process name and use that for output. |
| strcpy(Stdfile, "stdout_"); |
| strcat(Stdfile, Name); |
| if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION)) |
| trace_printf("%s@%d Unable to find stdout file for " |
| "process=%s, pid=%d. Using file %s for stdout.\n", |
| method_name, __LINE__, Name, os_pid, Stdfile); |
| } |
| Redirector.stdoutFd(Nid, os_pid, pfds_stdout[0], Stdfile, |
| AncestorNid, AncestorPid); |
| |
| fd_stdout_ = pfds_stdout[0]; |
| } |
| |
| // stderr pipe to child: |
| // We don't need write end of pipe. |
| // Add the read end of file descriptor to list of file |
| // descriptors monitored. |
| if (pfds_stderr[0] != -1) |
| { |
| close(pfds_stderr[1]); |
| Redirector.stderrFd(MyNode->GetHostname(), Name, Nid, os_pid, pfds_stderr[0]); |
| fd_stderr_ = pfds_stderr[0]; |
| } |
| |
| forkExit: |
| // release fork semaphore so child can get it |
| if ( sem_post(MyNode->GetMutex()) == -1 ) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[CProcess::Create], Parent can't put mutex.\n"); |
| mon_log_write(MON_PROCESS_CREATE_7, SQ_LOG_ERR, la_buf); |
| } |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Parent put mutex so child can proceed.\n", |
| method_name, __LINE__); |
| |
| #ifdef USE_FORK_SUSPEND_RESUME |
| mon_thread_resume_suspended(); |
| #endif // USE_FORK_SUSPEND_RESUME |
| |
| } |
| // LCOV_EXCL_START |
| // Exclude the following from monitor code coverage measurement since |
| // it is executed by a child process not the monitor process. |
| else |
| { |
| // I'm the child process |
| |
| // Take fork semaphore. We need to wait until parent indicates |
| // it is ok to proceed. Pipes between parent and child need to |
| // be set up before child can continue. |
| bool sem_log_error = false; |
| int sem_rc; |
| int err = 0; |
| struct timeval logTime; |
| struct tm *ltime; |
| struct timespec ts; |
| |
| if (clock_gettime(CLOCK_REALTIME, &ts) == -1) |
| { |
| err = errno; |
| gettimeofday(&logTime, NULL); |
| ltime = localtime(&logTime.tv_sec); |
| snprintf(la_buf, sizeof(la_buf), |
| "%02d/%02d/%02d-%02d:%02d:%02d " |
| "[CProcess::Create], clock_gettime(CLOCK_REALTIME)," |
| " Child can't get time, %s (%d), program %s, (pid=%d).\n" |
| , ltime->tm_mon+1, ltime->tm_mday, ltime->tm_year-100, ltime->tm_hour, ltime->tm_min, ltime->tm_sec |
| , strerror(err), err |
| , filename, getpid()); |
| write (2, la_buf, strlen(la_buf)); |
| } |
| ts.tv_sec += 1; |
| |
| env = getenv( "MON_CREATE_SEM_DELAY" ); |
| if (env && isdigit(*env)) |
| { |
| ts.tv_sec = atol(env); |
| } |
| |
| env = getenv( "MON_CREATE_SEM_LOG_ERROR" ); |
| if (env && isdigit(*env)) |
| { |
| int val = atoi(env); |
| sem_log_error = (val != 0) ? true : false; |
| } |
| do |
| { |
| sem_rc = sem_timedwait(MyNode->GetMutex(), &ts); |
| err = errno; |
| if ( sem_log_error && err == ETIMEDOUT ) |
| { |
| gettimeofday(&logTime, NULL); |
| ltime = localtime(&logTime.tv_sec); |
| snprintf(la_buf, sizeof(la_buf), |
| "%02d/%02d/%02d-%02d:%02d:%02d " |
| "[CProcess::Create], Child can't take mutex," |
| " %s (%d), program %s, (pid=%d).\n" |
| , ltime->tm_mon+1, ltime->tm_mday, ltime->tm_year-100, ltime->tm_hour, ltime->tm_min, ltime->tm_sec |
| , strerror(err), err |
| , filename, getpid()); |
| write (2, la_buf, strlen(la_buf)); |
| } |
| } |
| while (sem_rc == -1 && (err == EINTR || err == ETIMEDOUT)); |
| |
| if ( sem_log_error && sem_rc == -1 && !(err == EINTR || err == ETIMEDOUT)) |
| { |
| gettimeofday(&logTime, NULL); |
| ltime = localtime(&logTime.tv_sec); |
| snprintf(la_buf, sizeof(la_buf), |
| "%02d/%02d/%02d-%02d:%02d:%02d " |
| "[CProcess::Create], Child can't take mutex," |
| " %s (%d), program %s, (pid=%d).\n" |
| , ltime->tm_mon+1, ltime->tm_mday, ltime->tm_year-100, ltime->tm_hour, ltime->tm_min, ltime->tm_sec |
| , strerror(errno), errno |
| , filename, getpid()); |
| write (2, la_buf, strlen(la_buf)); |
| } |
| |
| // check if monitor wanted child to stay or go away |
| close(pipefd[1]); // close the write-end, not going to use |
| // read till EOF |
| while (read(pipefd[0], &childGoAway, sizeof(childGoAway)) > 0); |
| close(pipefd[0]); // close the read-end of the pipe |
| |
| if (childGoAway) |
| { |
| _exit( ENOEXEC ); |
| } |
| |
| // set the process's process id to the os process id for compatability |
| pid_t myPid = getpid(); |
| sprintf (argv[j + 3], "%6.6d", myPid); |
| |
| char *pName = argv[j + 4]; |
| if (pName[0] == '\0') |
| { // No name assigned to the process so generate one based on |
| // the node-id and process-id. |
| MyNode->BuildOurName(Nid, myPid, pName); |
| } |
| |
| // Unmask all allowed signals in the child |
| // except SIGUSR1 |
| sigset_t mask; |
| sigemptyset(&mask); |
| sigaddset(&mask, SIGUSR1); |
| rc = pthread_sigmask(SIG_SETMASK, &mask, NULL); |
| if (rc != 0) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[CProcess::Create], pthread_sigmask() error:" |
| " %s (%d), program %s.\n", strerror(rc), rc, filename); |
| write (2, la_buf, strlen(la_buf)); |
| } |
| |
| // set child process's priority based on minimums and specified value |
| nice(priority); |
| |
| // Redirect standard input, standard output, standard error |
| RedirectStdFiles(pfds_stdin, pfds_stdout, pfds_stderr); |
| |
| // Close file descriptors opened by the monitor parent except |
| // for stdin, stdout, stderr. |
| MyNode->close_fds (); |
| |
| char *name; |
| size_t pathlen; |
| |
| // Get program search path |
| pathlen = path.length(); |
| |
| size_t len; |
| len = strlen(filename) + 1; |
| |
| // Allocate space to hold the pathnames + filename |
| size_t alloclen; |
| alloclen = pathlen + len + 1; |
| name = new char[alloclen]; |
| |
| // Place the program filename at the end of the buffer preceeded |
| // by a slash. |
| name = (char *) memcpy(name + pathlen + 1, filename, len); |
| *--name = '/'; |
| |
| // Try to find the program in the directories specified by PATH. |
| // Each element of the path is tried until we find the program |
| // or run out of elements to try. |
| const char *pEnd; |
| const char *pStart; |
| |
| pEnd = path.c_str(); |
| do |
| { |
| char *startp; |
| |
| pStart = pEnd; |
| pEnd = strchr(pStart, ':'); |
| if (!pEnd) |
| pEnd = strchr(pStart, '\0'); |
| |
| if (pEnd == pStart) |
| // Two adjacent colons, or a colon at the beginning or the end |
| // of `PATH' means to search the current directory. |
| startp = name + 1; |
| else |
| // Copy the next path into the buffer just before the |
| // program filename. |
| startp = (char *) memcpy(name - (pEnd - pStart), pStart, pEnd - pStart); |
| |
| // Try to execute this name. If it works, execve will not return. |
| execve( startp, argv, childEnv); |
| |
| switch (errno) |
| { |
| case EACCES: |
| case ENOENT: |
| case ESTALE: |
| case ENOTDIR: |
| case ENODEV: |
| case ETIMEDOUT: |
| case ENOEXEC: |
| // Those errors indicate the file is missing or not |
| // executable by us, in which case we want to just try |
| // the next path directory. |
| break; |
| |
| default: |
| // Some other error means we found an executable file, but |
| // something went wrong executing it; return the error to |
| // our caller. |
| goto execFailed; |
| } |
| } while (*pEnd++ != '\0'); |
| |
| execFailed: |
| // The specified program could not be executed. Note that at this |
| // point we are executing as the child process. We will exit and |
| // the monitor will get a "child death" signal and take the |
| // appropriate actions. |
| // |
| // It's probably not possible to log an error at this point |
| // since the error logging mechanism is probably not available |
| // at this early stage of child process startup. We can write to |
| // the standard error file descriptor since the monitor has set that |
| // up as a pipe back to itself. |
| |
| snprintf(la_buf, sizeof(la_buf), |
| "Unable to execute program %s, %s (%d).\n", |
| filename, strerror(errno), errno); |
| write (2, la_buf, strlen(la_buf)); |
| |
| _exit( errno ); |
| } |
| // LCOV_EXCL_STOP |
| |
| MemModLock.unlock(); |
| |
| if (rc == MPI_SUCCESS && result == MPI_SUCCESS) |
| { |
| successful = true; |
| PidAtFork_ = os_pid; |
| |
| // Indicate that process exists but has not yet completed initialization. |
| State_ = State_Initializing; |
| |
| MyNode->SetAffinity( this ); |
| |
| if (Backup) |
| { |
| if ( !parent ) |
| { // Unexpectedly have null parent pointer |
| snprintf(la_buf, sizeof(la_buf), |
| "[CProcess::CProcess], No Primary for Backup process!\n"); |
| mon_log_write(MON_PROCESS_PROCESS_2, SQ_LOG_ERR, la_buf); |
| } |
| else if (strcmp (parent->Name, Name) != 0) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[CProcess::Create], Primary & Backup process name " |
| "don't match!\n"); |
| mon_log_write(MON_PROCESS_CREATE_10, SQ_LOG_ERR, la_buf); |
| } |
| else |
| { |
| // primary & backup processes are parent's of each other |
| parent->Parent_Nid = Nid; |
| parent->Parent_Pid = Pid; |
| parent->Backup = false; |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - Assigning parent nid=" "%d" ", pid=" "%d" " with (really child) parent nid=" "%d" ", parent pid=" "%d" "\n", method_name, __LINE__, parent->Nid, parent->Pid, Nid, Pid); |
| } |
| } |
| |
| if ( Backup ) |
| { // For a backup process the "parent" is the CProcess object |
| // for the primary process. So find the real parent process |
| // object. |
| parent = Nodes->GetLNode ( PairParentNid ) |
| ->GetProcessL( PairParentPid ); |
| } |
| |
| if ( !UnHooked && parent && !Backup ) |
| { // Parent process object keeps track of child processes |
| // created on this node. Needed in case parent process |
| // exits abnormally. |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL |
| | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d - Child process %s (%d, %d) added to " |
| "parent %s (%d, %d)\n", method_name, __LINE__, |
| Name, Nid, Pid, parent->GetName(), |
| parent->GetNid(), parent->GetPid()); |
| |
| parent->childAdd ( Nid, Pid ); |
| |
| } |
| |
| Monitor->writeProcessMapBegin( Name, Nid, Pid, Verifier, |
| parent ? parent->GetNid() : -1, |
| parent ? parent->GetPid() : -1, |
| parent ? parent->GetVerifier() : -1, |
| program.c_str() ); |
| } |
| else |
| { |
| successful = false; |
| result = MPI_ERR_SPAWN; |
| |
| if (NameServerEnabled) |
| { |
| rc2 = PtpClient->ProcessInit( this |
| , tag |
| , result |
| , parent->Nid ); |
| if (rc2) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| CLNode *parentLNode = NULL; |
| parentLNode = Nodes->GetLNode( parent->Nid ); |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process create failure " |
| "for process %s (%d, %d) " |
| "result to parent node %s, nid=%d, result=%d\n" |
| , method_name |
| , GetName() |
| , GetNid() |
| , GetPid() |
| , parentLNode->GetNode()->GetName() |
| , parentLNode->GetNid(), result ); |
| mon_log_write(MON_PROCESS_CREATE_13, SQ_LOG_ERR, la_buf); |
| } |
| } |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[CProcess::Create], Failed to start process %s path= %s.\n", Name, path.c_str()); |
| mon_log_write(MON_PROCESS_CREATE_11, SQ_LOG_ERR, buf); |
| } |
| |
| // release allocated memory |
| for (i = 0; argv[i]; i++) |
| { |
| delete [] argv[i]; |
| } |
| delete [] argv; |
| |
| for (i = 0; childEnv[i]; i++) |
| { |
| delete [] childEnv[i]; |
| } |
| |
| TRACE_EXIT; |
| |
| return successful; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcess::Dump (CProcess *dumper, char *core_path) |
| { |
| bool status = FAILURE; |
| CReplDump *repl; |
| |
| const char method_name[] = "CProcess::Dump"; |
| TRACE_ENTRY; |
| |
| switch (DumpState) |
| { |
| case Dump_Ready: |
| DumpState = Dump_Pending; |
| dumpFile_ = core_path; |
| DumperNid = dumper->Nid; |
| DumperPid = dumper->Pid; |
| DumperVerifier = dumper->Verifier; |
| status = SUCCESS; |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf("%s@%d - DumpState=Dump_Pending, pid=%d\n", |
| method_name, __LINE__, Pid); |
| repl = new CReplDump(this); |
| Replicator.addItem(repl); |
| break; |
| |
| default: |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf("%s@%d - Dump already in progress, pid=%d\n", |
| method_name, __LINE__, Pid); |
| break; |
| } |
| |
| TRACE_EXIT; |
| |
| return status; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| static void cprocess_dump_cb(void *ctx, pid_t pid, int status) |
| { |
| CLNode *lnode = static_cast<CLNode *>(ctx); |
| lnode->DumpCallback( lnode->GetNid(), pid, status ); |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::DumpBegin (int nid, int pid, Verifier_t verifier, char *core_path) |
| { |
| char *argv[6]; |
| char *cmd; |
| char core_file[MAX_PROCESS_PATH]; |
| char core_pid[20]; |
| char date[20]; |
| int err; |
| struct timeval tv; |
| struct tm tx; |
| |
| const char method_name[] = "CProcess::DumpBegin"; |
| TRACE_ENTRY; |
| |
| DumperNid = nid; |
| DumperPid = pid; |
| DumperVerifier = verifier; |
| if (Clone) |
| { |
| DumpState = Dump_InProgress; |
| } |
| else |
| { |
| // Increment reference count for process object until DumpEnd |
| incrReplRef(); |
| |
| gettimeofday(&tv, NULL); |
| localtime_r(&tv.tv_sec, &tx); |
| snprintf(date, sizeof(date), "%d-%02d-%02d_%02d-%02d-%02d", |
| tx.tm_year + 1900, |
| tx.tm_mon + 1, |
| tx.tm_mday, |
| tx.tm_hour, |
| tx.tm_min, |
| tx.tm_sec); |
| |
| string program; |
| Config->strIdToString ( programStrId_, program ); |
| |
| cmd = rindex((char *) program.c_str(), '/'); |
| if (cmd == NULL) |
| cmd = (char *) program.c_str(); |
| else |
| cmd++; // past '/' |
| // date=%Y-%m-%d_%H-%M-%S |
| // core_file=<path>/core.<date>.<pname>.<pid>.<cmd> |
| snprintf(core_file, sizeof(core_file), "%s/core.%s.%s.%d.%s", |
| core_path, |
| date, |
| &Name[1], |
| Pid, |
| cmd); |
| corefile_ = core_file; |
| |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf("%s@%d - starting mondump for pid=%d, core-file=%s\n", |
| method_name, __LINE__, Pid, core_file); |
| |
| argv[0] = (char *) "mondump"; |
| snprintf(core_pid, sizeof(core_pid), "%d", Pid); |
| argv[1] = core_pid; |
| argv[2] = core_file; |
| if ((nid == Nid) || getenv("SQ_VIRTUAL_NODES")) |
| argv[3] = NULL; |
| else |
| { |
| argv[3] = (char *) Nodes->GetNode(Nid)->GetName(); |
| argv[4] = getenv("MPI_TMPDIR"); |
| argv[5] = NULL; |
| } |
| CLNode *lnode = Nodes->GetLNode( Nid ); |
| err = IntProcess.create(argv[0], |
| argv, |
| cprocess_dump_cb, // cb |
| Pid, // cb_pid |
| lnode, // cb_ctx |
| NULL); |
| if (err == 0) |
| { |
| dumpFile_ = core_file; |
| DumpState = Dump_InProgress; |
| } |
| else |
| { |
| DumpState = Dump_Complete; |
| CReplDumpComplete *repl = new CReplDumpComplete(this); |
| Replicator.addItem(repl); |
| CompleteDump(Dump_Failed, NULL); |
| } |
| } |
| |
| if (trace_settings & TRACE_PROCESS) |
| { |
| if (DumpState == Dump_InProgress) |
| trace_printf("%s@%d - DumpState=Dump_InProgress, pid=%d\n", |
| method_name, __LINE__, Pid); |
| else |
| trace_printf("%s@%d - DumpState=Dump_Complete, pid=%d\n", |
| method_name, __LINE__, Pid); |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| const char *DumpStateString( DUMPSTATE state) |
| { |
| const char *str; |
| |
| switch( state ) |
| { |
| case Dump_Unknown: |
| str = "Dump_Unknown"; |
| break; |
| case Dump_Ready: |
| str = "Dump_Ready"; |
| break; |
| case Dump_Pending: |
| str = "Dump_Pending"; |
| break; |
| case Dump_InProgress: |
| str = "Dump_InProgress"; |
| break; |
| case Dump_Complete: |
| str = "Dump_Complete"; |
| break; |
| default: |
| str = "DumpState - Undefined"; |
| break; |
| } |
| |
| return( str ); |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::DumpEnd (DUMPSTATUS status, char *core_file) |
| { |
| const char method_name[] = "CProcess::DumpEnd"; |
| TRACE_ENTRY; |
| |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf("%s@%d - name=%s, DumpState=%s, DumpStatus=%d, pid=%d, core_file=%s\n", |
| method_name, __LINE__, Name, DumpStateString(DumpState), status, Pid, core_file); |
| |
| if ( DumpState != Dump_Ready ) |
| { |
| CompleteDump(status, core_file); |
| } |
| |
| // Decrement reference count for process object |
| decrReplRef(); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| struct message_def * CProcess::DeathMessage( ) |
| { |
| struct message_def *msg; |
| |
| const char method_name[] = "CProcess::DeathMessage"; |
| TRACE_ENTRY; |
| |
| // Record statistics (sonar counters) |
| if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED)) |
| MonStats->notice_death_Incr(); |
| |
| msg = new struct message_def; |
| msg->type = MsgType_ProcessDeath; |
| msg->noreply = true; |
| msg->u.request.type = ReqType_Notice; |
| msg->u.request.u.death.nid = Nid; |
| msg->u.request.u.death.pid = Pid; |
| msg->u.request.u.death.verifier = Verifier; |
| msg->u.request.u.death.trans_id.txid[0] = 0; |
| msg->u.request.u.death.trans_id.txid[1] = 0; |
| msg->u.request.u.death.trans_id.txid[2] = 0; |
| msg->u.request.u.death.trans_id.txid[3] = 0; |
| msg->u.request.u.death.aborted = IsAbended(); |
| strcpy(msg->u.request.u.death.process_name, Name); |
| msg->u.request.u.death.type = Type; |
| #ifdef USE_SEQUENCE_NUM |
| msg->u.request.u.death.seqnum = Monitor->GetTimeSeqNum(); |
| #endif |
| |
| if (trace_settings & ( TRACE_TMSYNC | TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - Death notice for process %s (%d, %d)\n", |
| method_name, __LINE__, Name, Nid, Pid ); |
| TRACE_EXIT; |
| |
| return msg; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::Exit( CProcess *parent ) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CProcess::Exit"; |
| TRACE_ENTRY; |
| |
| if ( DumpState != Dump_Ready ) |
| { |
| DumpEnd( Dump_Failed, (char *)corefile_.c_str() ); |
| } |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d" " - Process %s (%d,%d:%d) is exiting, parent process %s (%d,%d:%d)\n" |
| , method_name, __LINE__ |
| , GetName(), GetNid(), GetPid(), GetVerifier() |
| , parent?parent->GetName():"" |
| , parent?parent->GetNid():-1 |
| , parent?parent->GetPid():-1 |
| , parent?parent->GetVerifier():-1 ); |
| |
| SetState(State_Stopped); |
| |
| if (!Clone && parent && NameServerEnabled) |
| { |
| if (parent->GetNid() != GetNid()) |
| { // parent is remote |
| if (parent->childCount() == 0) |
| { // process is parent's last child |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - Parent's last child, deleting clone process %s, (%d,%d:%d)\n" |
| , method_name, __LINE__ |
| , parent->GetName() |
| , parent->GetNid() |
| , parent->GetPid() |
| , parent->GetVerifier() ); |
| } |
| Nodes->DeleteCloneProcess( parent ); |
| parent = NULL; |
| } |
| else |
| { |
| ProcessInfoNs_reply_def processInfo; |
| int rc = Nodes->GetProcessInfoNs( parent->GetNid() |
| , parent->GetPid() |
| , parent->GetVerifier() |
| , &processInfo); |
| if (rc == MPI_ERR_NAME) |
| { // parent exited |
| if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) |
| { |
| trace_printf( "%s@%d - Deleting clone process %s, (%d,%d:%d)\n" |
| , method_name, __LINE__ |
| , parent->GetName() |
| , parent->GetNid() |
| , parent->GetPid() |
| , parent->GetVerifier() ); |
| } |
| Nodes->DeleteCloneProcess( parent ); |
| parent = NULL; |
| } |
| } |
| } |
| } |
| |
| // if the env is set to not deliver death messages upon node down, |
| // check the state of the process' node. |
| bool supplyProcessDeathNotices = true; |
| if (!Monitor->IsNodeDownDeathNotices()) |
| { |
| CNode * node = Nodes->GetLNode(GetNid())->GetNode(); |
| // if process' node is being killed, do not supply process death notices |
| supplyProcessDeathNotices = node->IsSoftNodeDown() |
| ? node->IsSoftNodeDown() |
| : !node->IsKillingNode(); |
| } |
| |
| if( NoticeHead && |
| !MyNode->IsKillingNode() && |
| !(Type == ProcessType_DTM && IsAbended()) && |
| supplyProcessDeathNotices ) |
| { |
| if ( !Clone && NameServerEnabled ) |
| { |
| // Notify all remote registered nodes of this process' death |
| NoticeHead->NotifyRemote(); |
| } |
| // Notify all local registered processes of this process' death |
| NoticeHead->NotifyAll(); |
| } |
| |
| if ( !Clone && !Paired ) |
| { |
| switch (Type) |
| { |
| case ProcessType_TSE: |
| case ProcessType_ASE: |
| MyNode->delFromQuiesceExitPids( GetPid(), GetVerifier() ); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s%d: pid %d deleted from quiesce exit list\n", method_name, __LINE__, GetPid()); |
| |
| if (MyNode->isInQuiesceState()) |
| { |
| if (MyNode->isQuiesceExitPidsEmpty()) |
| { |
| HealthCheck.setState(MON_SCHED_NODE_DOWN); // schedule a node down req |
| } |
| } |
| else |
| { // unmount volumes only if node is not quiescing. |
| Devices->UnMountVolume( Name, Backup ); |
| } |
| break; |
| case ProcessType_DTM: |
| if ( IsAbended() ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - DTM abended %s (%d, %d:%d)\n" |
| , method_name, __LINE__, Name, Nid, Pid, Verifier); |
| if ( !MyNode->IsKillingNode() && |
| !IsPersistent() && |
| MyNode->GetShutdownLevel() != ShutdownLevel_Abrupt ) |
| { |
| MyNode->SetDTMAborted( true ); |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - DTM stopped normally" "\n", method_name, __LINE__); |
| if ( !MyNode->IsKillingNode() && |
| !IsPersistent() && |
| MyNode->GetShutdownLevel() == ShutdownLevel_Undefined ) |
| { |
| MyNode->SetDTMAborted( true ); |
| } |
| else |
| { |
| if ( Monitor->GetTmLeader() == MyPNID ) |
| { |
| // set the clean shutdown condition |
| char key[MAX_KEY_NAME]; |
| char value[10]; |
| strcpy(key,"Clean_Shutdown"); |
| strcpy(value,"True"); |
| Config->GetClusterGroup()->Set( key, value ); |
| } |
| } |
| } |
| break; |
| case ProcessType_SMS: |
| if ( IsAbended() ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - SMS abended %s (%d, %d:%d)\n" |
| , method_name, __LINE__, Name, Nid, Pid, Verifier); |
| if ( !MyNode->IsKillingNode() && |
| MyNode->GetShutdownLevel() != ShutdownLevel_Abrupt ) |
| { |
| MyNode->SetSMSAborted( true ); |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - SMS stopped normally" "\n", method_name, __LINE__); |
| if ( !MyNode->IsKillingNode() && |
| MyNode->GetShutdownLevel() == ShutdownLevel_Undefined ) |
| { |
| MyNode->SetSMSAborted( true ); |
| } |
| } |
| break; |
| case ProcessType_NameServer: |
| if ( IsAbended() ) |
| { |
| if (!Clone) |
| { |
| NameServer->NameServerExited(); |
| } |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - NameServer abended" "\n", method_name, __LINE__); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - NameServer stopped normally" "\n", method_name, __LINE__); |
| } |
| break; |
| case ProcessType_Watchdog: |
| if ( IsAbended() ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - Watchdog abended" "\n", method_name, __LINE__); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - Watchdog stopped normally" "\n", method_name, __LINE__); |
| } |
| break; |
| case ProcessType_MXOSRVR: |
| case ProcessType_Generic: |
| if ( MyNode->GetState() == State_Up && |
| !MyNode->IsKillingNode() && |
| MyNode->GetShutdownLevel() == ShutdownLevel_Undefined ) |
| { |
| // Send logical node's SSMP process this process' death message |
| CLNode *lnode = MyNode->GetLNode( Nid ); |
| if ( lnode ) |
| { |
| CProcess *ssmpProcess = lnode->GetSSMProc(); |
| if ( ssmpProcess && Pid != -1 ) |
| { |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf("%s@%d: Queueing death notice for SSMP process for %s (%d, %d:%d)\n", |
| method_name, __LINE__, Name, Nid, Pid, Verifier); |
| |
| ssmpProcess->ssmpNoticesLock_.lock(); |
| ssmpProcess->ssmpNotices_.push_back( DeathMessage() ); |
| ssmpProcess->ssmpNoticesLock_.unlock(); |
| SQ_theLocalIOToClient->nudgeNotifier (); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL | TRACE_NOTICE )) |
| trace_printf("%s@%d: No SSMP process found in nid=%d\n", |
| method_name, __LINE__, lnode->GetNid()); |
| } |
| } |
| } |
| break; |
| |
| case ProcessType_SSMP: |
| // Indicate no SSM process on this node. |
| Nodes->GetLNode ( Nid )->SetSSMProc ( NULL ); |
| break; |
| |
| case ProcessType_AMP: |
| case ProcessType_Backout: |
| case ProcessType_VolumeRecovery: |
| case ProcessType_SPX: |
| case ProcessType_PSD: |
| case ProcessType_PERSIST: |
| // No special handling needed on exit |
| break; |
| default: |
| |
| snprintf(la_buf, sizeof(la_buf), |
| "[CProcess::Exit], Invalid process type!\n"); |
| mon_log_write(MON_PROCESS_EXIT_1, SQ_LOG_ERR, la_buf); |
| } |
| |
| // Remove this child process from parent's child-process-list. |
| if ( (parent != NULL) && (parent->GetState() == State_Up) ) |
| { |
| parent->childRemove( Nid, Pid); |
| if (NameServerEnabled) |
| { |
| parent->childUnHookedRemove( Nid, Pid); |
| } |
| } |
| |
| // Check if we need to output a entry into the process id map log file |
| if ( PidMap ) |
| { |
| Monitor->writeProcessMapEnd( Name, Nid, Pid, Verifier, |
| parent ? parent->GetNid() : -1, |
| parent ? parent->GetPid() : -1, |
| parent ? parent->GetVerifier() : -1, |
| program() ); |
| } |
| } |
| |
| if ( Clone && Pid != -1 ) |
| { |
| if ( Type == ProcessType_SPX && |
| MyNode->GetShutdownLevel() == ShutdownLevel_Undefined && |
| supplyProcessDeathNotices ) |
| { |
| // Send local SPX this SPX's death message |
| CLNode *lnode = MyNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| CProcess *spxProcess = lnode->GetProcessLByType( ProcessType_SPX ); |
| if ( spxProcess && MyNode->GetState() == State_Up ) |
| { |
| SQ_theLocalIOToClient->putOnNoticeQueue( spxProcess->Pid |
| , spxProcess->Verifier |
| , DeathMessage() |
| , NULL); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d)\n" |
| , method_name, __LINE__ |
| , GetName(), GetNid(), GetPid(), GetVerifier() |
| , spxProcess->GetName(), spxProcess->GetNid() |
| , spxProcess->GetPid(), spxProcess->GetVerifier()); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d: No SPX process found in nid=%d\n", |
| method_name, __LINE__, lnode->GetNid()); |
| } |
| } |
| } |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf( "%s@%d" " - Death message check of %s (%d,%d:%d) type=%s, node phase=%s, send death notices=%d\n" |
| , method_name, __LINE__ |
| , GetName(), GetNid(), GetPid(), GetVerifier() |
| , ProcessTypeString(GetType()), NodePhaseString( MyNode->GetPhase() ) |
| , supplyProcessDeathNotices ); |
| |
| if ( Type == ProcessType_DTM && |
| MyNode->GetPhase() == Phase_Ready && |
| supplyProcessDeathNotices ) |
| { |
| // Send local DTMs this DTM's death message |
| CLNode *lnode = MyNode->GetFirstLNode(); |
| for ( ; lnode; lnode = lnode->GetNextP() ) |
| { |
| CProcess *tmProcess = lnode->GetProcessLByType( ProcessType_DTM ); |
| if ( tmProcess && MyNode->GetState() == State_Up ) |
| { |
| SQ_theLocalIOToClient->putOnNoticeQueue( tmProcess->Pid |
| , tmProcess->Verifier |
| , DeathMessage() |
| , NULL); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d)\n" |
| , method_name, __LINE__ |
| , GetName(), GetNid(), GetPid(), GetVerifier() |
| , tmProcess->GetName(), tmProcess->GetNid() |
| , tmProcess->GetPid(), tmProcess->GetVerifier()); |
| |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d: No DTM process found in nid=%d\n", |
| method_name, __LINE__, lnode->GetNid()); |
| } |
| } |
| } |
| } |
| |
| if ( parent && !parent->IsClone() && Pid != -1 ) |
| { |
| |
| // If process and parent are DTMs suppress death |
| // message here, it was delivered above |
| if ( parent->IsSystemMessages() && |
| parent->GetState() == State_Up && |
| !MyNode->IsKillingNode() && |
| !(GetType() == ProcessType_DTM && |
| parent->GetType() == ProcessType_DTM) && |
| supplyProcessDeathNotices ) |
| { |
| SQ_theLocalIOToClient->putOnNoticeQueue( parent->Pid |
| , parent->Verifier |
| , DeathMessage() |
| , NULL); |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d) \n" |
| , method_name, __LINE__ |
| , GetName(), GetNid(), GetPid(), GetVerifier() |
| , parent->GetName(), parent->GetNid() |
| , parent->GetPid(), parent->GetVerifier()); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - Parent doesn't want Death message" "\n", method_name, __LINE__); |
| } |
| } |
| |
| if (NameServerEnabled) |
| { |
| if ( parent ) |
| { |
| if ( parent->IsClone() && Pid != -1 ) |
| { |
| int targetNid = parent->GetNid(); |
| CLNode *targetLNode = Nodes->GetLNode( targetNid ); |
| // Send the process exit to the parent node |
| int rc = PtpClient->ProcessExit( this |
| , targetNid |
| , targetLNode->GetNode()->GetName() ); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process exit " |
| "for process %s (%d, %d) " |
| "to parent node %s, nid=%d\n" |
| , method_name |
| , GetName() |
| , GetNid() |
| , GetPid() |
| , targetLNode->GetNode()->GetName() |
| , targetLNode->GetNid() ); |
| mon_log_write(MON_PROCESS_PROCEXIT_1, SQ_LOG_ERR, la_buf); |
| } |
| } |
| } |
| else |
| { |
| if (GetParentNid() != -1) |
| { |
| int targetNid = GetParentNid(); |
| CLNode *targetLNode = Nodes->GetLNode( targetNid ); |
| // Send the process exit to the parent node |
| int rc = PtpClient->ProcessExit( this |
| , targetNid |
| , targetLNode->GetNode()->GetName() ); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process exit " |
| "for process %s (%d, %d) " |
| "to parent node %s, nid=%d\n" |
| , method_name |
| , GetName() |
| , GetNid() |
| , GetPid() |
| , targetLNode->GetNode()->GetName() |
| , targetLNode->GetNid() ); |
| mon_log_write(MON_PROCESS_PROCEXIT_2, SQ_LOG_ERR, la_buf); |
| } |
| } |
| } |
| procExitNotifierNodes(); |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::GenerateEvent( int event_id, int length, char *data ) |
| { |
| struct message_def *msg; |
| |
| const char method_name[] = "CProcess::GenerateEvent"; |
| TRACE_ENTRY; |
| if( Clone ) |
| { |
| if ( Event_messages ) |
| { |
| // Replicate the event to other nodes |
| CReplEvent *repl = new CReplEvent(event_id, length, data, Nid, Pid, Verifier); |
| Replicator.addItem(repl); |
| } |
| } |
| else |
| { |
| if ( Event_messages ) |
| { |
| msg = new struct message_def; |
| msg->type = MsgType_Event; |
| msg->noreply = true; |
| msg->u.request.type = ReqType_Notice; |
| msg->u.request.u.event_notice.event_id = event_id; |
| msg->u.request.u.event_notice.length = length; |
| memset( msg->u.request.u.event_notice.data, 0, MAX_SYNC_DATA ); |
| if (length && data) |
| { |
| memmove( msg->u.request.u.event_notice.data, data, (length>MAX_SYNC_DATA)?MAX_SYNC_DATA:length ); |
| } |
| |
| SQ_theLocalIOToClient->putOnNoticeQueue( Pid |
| , Verifier |
| , msg |
| , NULL); |
| } |
| } |
| TRACE_EXIT; |
| } |
| #endif |
| |
| CProcess *CProcess::GetBackup (void) |
| { |
| CLNode *node = NULL; |
| CProcess *parent = NULL; |
| CProcess *backup = NULL; |
| |
| node = Nodes->GetLNode (Parent_Nid); |
| if (node) |
| { |
| parent = node->GetProcessL(Parent_Pid); |
| if (parent) |
| { |
| backup = (parent->Backup ? parent : NULL); |
| } |
| } |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("CProcess::GetBackup" "@%d" " - name= %s(%d:%d), parent=%p(%s), backup=%p" "\n", __LINE__, Name, Parent_Nid, Parent_Pid, parent, parent ? parent->Name : "None", backup); |
| |
| return backup; |
| } |
| |
| |
| // see: CProcessContainer::GetProcess (int pid) |
| // see: CProcessContainer::GetProcess (char *name, bool checkstate) |
| |
| CProcess *CProcess::GetProcessByType( PROCESSTYPE type ) |
| { |
| CProcess *entry = this; |
| |
| const char method_name[] = "CProcess::GetProcessByType"; |
| TRACE_ENTRY; |
| while (entry) |
| { |
| if (entry->Type == type) |
| { |
| // Only return entry if it has completed startup |
| if (entry->State_ != State_Up) |
| { |
| entry = NULL; |
| } |
| break; |
| } |
| entry = entry->next_; |
| } |
| TRACE_EXIT; |
| return entry; |
| } |
| |
| // see: CLNode::GetProcessL (int pid) |
| // see: CLNode::GetProcessL (char *name, bool checkstate) |
| |
| CProcess *CProcess::GetProcessLByType( PROCESSTYPE type ) |
| { |
| CProcess *entry = this; |
| |
| const char method_name[] = "CProcess::GetProcessLByType"; |
| TRACE_ENTRY; |
| while (entry) |
| { |
| if (entry->Type == type) |
| { |
| // Only return entry if it has completed startup |
| if (entry->State_ != State_Up) |
| { |
| entry = NULL; |
| } |
| break; |
| } |
| entry = entry->nextL_; |
| } |
| TRACE_EXIT; |
| return entry; |
| } |
| |
| bool CProcess::MakePrimary (void) |
| { |
| bool successful; |
| CLNode *node = NULL; |
| CProcess *primary = NULL; |
| CProcess *backup = NULL; |
| |
| const char method_name[] = "CProcess::MakePrimary"; |
| TRACE_ENTRY; |
| if (Backup) |
| { |
| backup = this; |
| if (Parent_Nid != -1) |
| { |
| node = Nodes->GetLNode (Parent_Nid); |
| if (node) |
| { |
| if (Parent_Pid != -1) |
| { |
| primary = node->GetProcessL(Parent_Pid); |
| if (!primary) |
| { |
| if (trace_settings & TRACE_REQUEST_DETAIL) |
| trace_printf("%s@%d" " - Can't find Primary process" "\n", method_name, __LINE__); |
| } |
| } |
| } |
| else |
| { |
| if (trace_settings & TRACE_REQUEST_DETAIL) |
| trace_printf("%s@%d" " - Can't find Primary process's node" "\n", method_name, __LINE__); |
| } |
| } |
| } |
| else |
| { |
| primary = this; |
| if (Parent_Nid != -1) |
| { |
| node = Nodes->GetLNode (Parent_Nid); |
| if (node) |
| { |
| if (Parent_Pid != -1) |
| { |
| backup = node->GetProcessL(Parent_Pid); |
| if (backup) |
| { |
| backup = (backup->Backup ? backup : NULL); |
| } |
| } |
| else |
| { |
| if (trace_settings & TRACE_REQUEST_DETAIL) |
| trace_printf("%s@%d" " - Can't find Backup process" "\n", method_name, __LINE__); |
| } |
| } |
| else |
| { |
| if (trace_settings & TRACE_REQUEST_DETAIL) |
| trace_printf("%s@%d" " - Can't find Backup process's node" "\n", method_name, __LINE__); |
| } |
| } |
| } |
| |
| if (primary == this) |
| { |
| if (trace_settings & TRACE_REQUEST_DETAIL) |
| trace_printf("%s@%d" "- Primary process will continue as Primary" "\n", method_name, __LINE__); |
| if (!backup) |
| { |
| primary->Parent_Nid = -1; |
| primary->Parent_Pid = -1; |
| } |
| successful = true; |
| } |
| else if (backup == this) |
| { |
| backup->Backup = false; |
| if (primary) |
| { |
| primary->Backup = true; |
| if (trace_settings & TRACE_REQUEST_DETAIL) |
| trace_printf("%s@%d" "- Old Primary process is now the Backup" "\n", method_name, __LINE__); |
| } |
| else |
| { |
| backup->Parent_Nid = -1; |
| backup->Parent_Pid = -1; |
| } |
| if (trace_settings & TRACE_REQUEST_DETAIL) |
| trace_printf("%s@%d" "- Backup process is now the Primary" "\n", method_name, __LINE__); |
| successful = true; |
| } |
| else |
| { |
| successful = false; |
| } |
| |
| TRACE_EXIT; |
| return successful; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcess::MyTransactions( struct message_def *msg ) |
| { |
| int idx; |
| CNotice *notice = NoticeHead; |
| |
| const char method_name[] = "CProcess::MyTransactions"; |
| TRACE_ENTRY; |
| |
| while (notice) |
| { |
| if ( !isNull( notice->TransID ) ) |
| { |
| idx = msg->u.reply.u.trans_info.num_processes; |
| msg->u.reply.u.trans_info.procs[idx].nid = notice->Nid; |
| msg->u.reply.u.trans_info.procs[idx].pid = notice->Pid; |
| msg->u.reply.u.trans_info.procs[idx].trans_id = notice->TransID; |
| msg->u.reply.u.trans_info.num_processes++; |
| if (msg->u.reply.u.trans_info.num_processes >= MAX_PROC_LIST) |
| { |
| msg->u.reply.u.trans_info.return_code = MPI_ERR_TRUNCATE; |
| return FAILURE; |
| } |
| } |
| notice = notice->GetNext(); |
| } |
| |
| TRACE_EXIT; |
| return SUCCESS; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcess::Open (CProcess * opened_process, int death_notification) |
| { |
| const char method_name[] = "CProcess::Open"; |
| TRACE_ENTRY; |
| |
| bool status; |
| |
| if ((opened_process->StartupCompleted) && |
| (opened_process->State_ == State_Up) && (State_ == State_Up)) |
| { |
| if ( death_notification |
| && !((opened_process->Parent_Nid == Nid) && |
| (opened_process->Parent_Pid == Pid)) ) |
| { |
| _TM_Txid_External transid; |
| transid = null_trans(); |
| opened_process->RegisterDeathNotification( Nid |
| , Pid |
| , Verifier |
| , Name |
| , transid); |
| } |
| status = SUCCESS; |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[CProcess::Open], Can't Open Process %s " |
| "has not completed startup protocol!\n", opened_process->Name); |
| mon_log_write(MON_PROCESS_OPEN_1, SQ_LOG_ERR, buf); |
| |
| status = FAILURE; |
| } |
| TRACE_EXIT; |
| |
| return status; |
| } |
| #endif |
| |
| void CProcessContainer::close_fds ( void ) |
| { |
| DIR *dirp = opendir("/proc/self/fd"); |
| for (;;) |
| { |
| if (dirp == NULL) |
| break; |
| struct dirent *direntp = readdir(dirp); |
| if (direntp == NULL) |
| break; |
| if (direntp->d_ino == 0) // invalid inode-number |
| continue; |
| if (direntp->d_name[0] == '.') // relative |
| continue; |
| int fd; |
| sscanf(direntp->d_name, "%d", &fd); |
| if (fd > 2) |
| close(fd); |
| } |
| if (dirp != NULL) |
| closedir(dirp); |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| CNotice *CProcess::RegisterDeathNotification( int nid |
| , int pid |
| , Verifier_t verifier |
| , const char *name |
| , _TM_Txid_External trans_id ) |
| { |
| CNotice *notice = NULL; |
| |
| const char method_name[] = "CProcess::RegisterDeathNotification"; |
| TRACE_ENTRY; |
| |
| deathInterestLock_.lock(); |
| |
| if ( NoticeHead ) |
| { |
| notice = NoticeHead->GetNotice( nid, pid, verifier, trans_id ); |
| } |
| if ( notice == NULL ) |
| { |
| notice = new CNotice (nid, pid, verifier, name, trans_id, this); |
| if (NoticeHead == NULL) |
| { |
| NoticeHead = NoticeTail = notice; |
| } |
| else |
| { |
| NoticeTail = NoticeTail->Link (notice); |
| } |
| } |
| else |
| { |
| // We have a duplicate registation request for notification. |
| // Just return original notice object without error. |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST)) |
| trace_printf("%s@%d" " - Already have registered for this notice" "\n", method_name, __LINE__); |
| } |
| |
| deathInterestLock_.unlock(); |
| |
| TRACE_EXIT; |
| return notice; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::ReplyNewProcess (struct message_def * reply_msg, |
| CProcess * process, int result) |
| { |
| const char method_name[] = "CProcess::ReplyNewProcess"; |
| TRACE_ENTRY; |
| |
| // the parent gets a new_process reply |
| reply_msg->type = MsgType_Service; |
| reply_msg->noreply = false; |
| reply_msg->reply_tag = process->ReplyTag; |
| reply_msg->u.reply.type = ReplyType_NewProcess; |
| reply_msg->u.reply.u.new_process.nid = process->Nid; |
| reply_msg->u.reply.u.new_process.pid = process->Pid; |
| reply_msg->u.reply.u.new_process.verifier = process->Verifier; |
| strcpy (reply_msg->u.reply.u.new_process.process_name,process->Name); |
| reply_msg->u.reply.u.new_process.return_code = result; |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS | TRACE_SYNC)) |
| trace_printf("%s@%d - Created process %s (%d, %d:%d), sending reply to " |
| "%s (%d, %d), result=%d\n", method_name, __LINE__, |
| process->Name, process->Nid, process->Pid, process->Verifier, |
| Name, Nid, Pid, result); |
| |
| // send reply to the parent |
| SQ_theLocalIOToClient->sendCtlMsg |
| ( Pid, MC_SReady, ((SharedMsgDef*)reply_msg)-> trailer.index ); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcess::SendProcessCreatedNotice(CProcess *parent, int result) |
| { |
| const char method_name[] = "CProcess::SendProcessCreatedNotice"; |
| TRACE_ENTRY; |
| |
| struct message_def *reply_msg; |
| |
| reply_msg = new struct message_def; |
| |
| // the parent gets a child started notice |
| reply_msg->type = MsgType_ProcessCreated; |
| reply_msg->noreply = true; |
| reply_msg->u.request.type = ReqType_Notice; |
| reply_msg->u.request.u.process_created.nid = Nid; |
| reply_msg->u.request.u.process_created.pid = Pid; |
| reply_msg->u.request.u.process_created.verifier = Verifier; |
| reply_msg->u.request.u.process_created.tag = Tag; |
| strcpy(reply_msg->u.request.u.process_created.port, Port); |
| strcpy(reply_msg->u.request.u.process_created.process_name, Name); |
| reply_msg->u.request.u.process_created.return_code = result; |
| if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS | TRACE_SYNC)) |
| trace_printf("%s@%d - Created process %s (%d, %d), sending process " |
| "created notice to %s (%d, %d), result=%d\n", |
| method_name, __LINE__, Name, Nid, Pid, |
| parent->Name, parent->Nid, parent->Pid, result); |
| |
| // send notice to the parent |
| SQ_theLocalIOToClient->putOnNoticeQueue( parent->Pid |
| , parent->Verifier |
| , reply_msg |
| , NULL); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| struct message_def * CProcess::GetDeathNotice( void ) |
| { |
| const char method_name[] = "CProcess::GetDeathNotice"; |
| TRACE_ENTRY; |
| |
| struct message_def *notice = NULL; |
| |
| ssmpNoticesLock_.lock(); |
| if ( ! ssmpNotices_.empty() ) |
| { |
| notice = ssmpNotices_.front(); |
| if ( notice ) |
| { |
| ssmpNotices_.pop_front(); |
| } |
| } |
| ssmpNoticesLock_.unlock(); |
| |
| TRACE_EXIT; |
| |
| return notice; |
| } |
| |
| void CProcess::PutDeathNotice( struct message_def * notice) |
| { |
| const char method_name[] = "CProcess::PutDeathNotice"; |
| TRACE_ENTRY; |
| |
| ssmpNoticesLock_.lock(); |
| ssmpNotices_.push_front ( notice ); |
| ssmpNoticesLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcess::Switch( CProcess *parent ) |
| { |
| const char method_name[] = "CProcess::Switch"; |
| TRACE_ENTRY; |
| |
| if (parent) |
| { |
| if (IsBackup()) |
| { |
| if (GetPid() == parent->GetParentPid()) |
| { |
| // The parent now doesn't have a backup |
| parent->SetParentNid ( -1 ); |
| parent->SetParentPid ( -1 ); |
| parent->SetParent ( NULL ); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - Parent not our primary" "\n", method_name, __LINE__); |
| } |
| } |
| if (parent->IsBackup()) |
| { |
| if (GetPid() == parent->GetParentPid()) |
| { |
| // The parent is now the primary |
| parent->SetBackup ( false ); |
| parent->SetParentNid ( -1 ); |
| parent->SetParentPid ( -1 ); |
| parent->SetParent ( NULL ); |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - Backup taking over, Name=" "%s" "\n", method_name, __LINE__, parent->GetName()); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - Parent not our backup" "\n", method_name, __LINE__); |
| } |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| |
| CProcessContainer::CProcessContainer (void) |
| :numProcs_(0) |
| ,nodeContainer_(false) |
| ,processNameFormatLong_(true) |
| ,nameMap_(NULL) |
| ,pidMap_(NULL) |
| ,head_(NULL) |
| ,tail_(NULL) |
| { |
| const char method_name[] = "CProcessContainer::CProcessContainer"; |
| TRACE_ENTRY; |
| |
| // Add eyecatcher sequence as a debugging aid |
| memcpy(&eyecatcher_, "PCTR", 4); |
| |
| //create & initialize existing semaphore |
| char sem_name[MAX_PROCESS_PATH]; |
| snprintf(sem_name,sizeof(sem_name), "/monitor.sem.%s", getenv("USER")); |
| Mutex = sem_open(sem_name,O_CREAT,0644,0); |
| if(Mutex == SEM_FAILED) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), "[%s], Can't create semaphore %s!\n", |
| method_name, sem_name); |
| mon_log_write(MON_PROCESSCONT_PROCESSCONT_1, SQ_LOG_ERR, buf); |
| |
| sem_unlink(sem_name); |
| abort(); |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| char *env = getenv("SQ_MON_PROCESS_NAME_FORMAT_LONG"); |
| if ( env && isdigit(*env) ) |
| { |
| int val = atoi(env); |
| processNameFormatLong_ = (val != 0) ? true : false; |
| } |
| #endif |
| |
| TRACE_EXIT; |
| } |
| |
| CProcessContainer::CProcessContainer( bool nodeContainer ) |
| :numProcs_(0) |
| ,nodeContainer_(nodeContainer) |
| ,processNameFormatLong_(true) |
| ,nameMap_(NULL) |
| ,pidMap_(NULL) |
| ,head_(NULL) |
| ,tail_(NULL) |
| { |
| const char method_name[] = "CProcessContainer::CProcessContainer"; |
| TRACE_ENTRY; |
| |
| // Add eyecatcher sequence as a debugging aid |
| memcpy(&eyecatcher_, "PCTR", 4); |
| |
| //create & initialize existing semaphore |
| char sem_name[MAX_PROCESS_PATH]; |
| snprintf(sem_name,sizeof(sem_name), "/monitor.sem.%s", getenv("USER")); |
| Mutex = sem_open(sem_name,O_CREAT,0644,0); |
| if(Mutex == SEM_FAILED) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| int err = errno; |
| snprintf(buf, sizeof(buf), "[%s], Can't create semaphore %s! (%s)\n", |
| method_name, sem_name, strerror(err)); |
| mon_log_write(MON_PROCESSCONT_PROCESSCONT_3, SQ_LOG_ERR, buf); |
| |
| err = sem_unlink(sem_name); |
| if (err == -1) |
| { |
| int err = errno; |
| snprintf(buf, sizeof(buf), "[%s], Can't unlink semaphore %s! (%s)\n", |
| method_name, sem_name, strerror(err)); |
| mon_log_write(MON_PROCESSCONT_PROCESSCONT_4, SQ_LOG_ERR, buf); |
| } |
| abort(); |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| char *env = getenv("SQ_MON_PROCESS_NAME_FORMAT_LONG"); |
| if ( env && isdigit(*env) ) |
| { |
| int val = atoi(env); |
| processNameFormatLong_ = (val != 0) ? true : false; |
| } |
| #endif |
| |
| if ( nodeContainer_ ) |
| { |
| nameMap_ = new nameMap_t; |
| pidMap_ = new pidMap_t; |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| CProcessContainer::~CProcessContainer (void) |
| { |
| const char method_name[] = "CProcessContainer::~CProcessContainer"; |
| TRACE_ENTRY; |
| |
| if ( nodeContainer_ ) |
| { |
| CleanUpProcesses(); |
| if ( nameMap_ ) |
| { |
| delete nameMap_; |
| } |
| if ( pidMap_ ) |
| { |
| delete pidMap_; |
| } |
| } |
| |
| sem_close(Mutex); |
| char sem_name[MAX_PROCESS_PATH]; |
| snprintf(sem_name,sizeof(sem_name), "/monitor.sem.%s", getenv("USER")); |
| sem_unlink(sem_name); |
| |
| // Alter eyecatcher sequence as a debugging aid to identify deleted object |
| memcpy(&eyecatcher_, "pctr", 4); |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcessContainer::AddToPidMap(int pid, CProcess *process) |
| { |
| const char method_name[] = "CProcessContainer::AddToPidMap"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| pair<pidMap_t::iterator, bool> ret; |
| |
| if (pid != -1) |
| { |
| // temp trace, remove once USE_PROCESS_MAPS is default |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d inserting into pidMap %p: %d, %s (%d, %d)\n" |
| , method_name, __LINE__ |
| , pidMap_, pid |
| , process->GetName(), process->GetNid(), process->GetPid()); |
| } |
| |
| pidMapLock_.lock(); |
| ret = pidMap_->insert( pidMap_t::value_type ( pid, process )); |
| pidMapLock_.unlock(); |
| if (ret.second == false) |
| { // Already had an entry with the given key value |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d pid map already contained process %d\n", |
| method_name, __LINE__, pid); |
| } |
| } |
| |
| // temp trace |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d pidMap_ (%p) now has %d entries\n", |
| method_name, __LINE__, pidMap_, (int)pidMap_->size()); |
| } |
| |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcessContainer::DelFromPidMap( CProcess *process ) |
| { |
| const char method_name[] = "CProcessContainer::DelFromPidMap"; |
| TRACE_ENTRY; |
| |
| pidMapLock_.lock(); |
| int count = pidMap_->erase ( process->GetPid() ); |
| pidMapLock_.unlock(); |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| if (count != 0) |
| { |
| trace_printf("%s@%d removed from pidMap %p: %s (%d, %d)\n", |
| method_name, __LINE__, pidMap_, |
| process->GetName(), process->GetNid(), process->GetPid()); |
| } |
| } |
| |
| if ( process->GetPid() != process->GetPidAtFork() ) |
| { // Process id changed after fork(). [This could happen if, for |
| // example, a shell script was the originally started process |
| // and it then started the actual process. |
| pidMapLock_.lock(); |
| int count = pidMap_->erase ( process->GetPidAtFork() ); |
| pidMapLock_.unlock(); |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| if (count != 0) |
| { |
| trace_printf("%s@%d removed from pidMap %p: %s (%d, %d)\n", |
| method_name, __LINE__, pidMap_, |
| process->GetName(), process->GetNid(), |
| process->GetPidAtFork()); |
| } |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcessContainer::AddToNameMap( CProcess *process ) |
| { |
| const char method_name[] = "CProcessContainer::AddToNameMap"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| pair<nameMap_t::iterator, bool> ret1; |
| |
| if ( strlen(process->GetName()) != 0 ) |
| { |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d inserting into nameMap %p: %s (%d, %d)\n", method_name, __LINE__, nameMap_, process->GetName(), process->GetNid(), process->GetPid()); |
| } |
| |
| nameMapLock_.lock(); |
| ret1 = nameMap_->insert( nameMap_t::value_type ( process->GetName(), |
| process )); |
| nameMapLock_.unlock(); |
| if (ret1.second == false) |
| { // Already had an entry with the given key value. This is not |
| // necessarily an error. One sceario where this can happen is |
| // if a new process request contains a user assigned process |
| // name and the process is to be created on another node. |
| // When the InternalType_ProcInit replication message is |
| // processed on the originating node we'll attempt to re-add |
| // the name (a system generated name will be added for the first |
| // time at this point.) |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d nameMap %p already contained process %s\n", |
| method_name, __LINE__, nameMap_, process->GetName()); |
| } |
| } |
| |
| // temp trace |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d nameMap_ (%p) now has %d entries\n", |
| method_name, __LINE__, nameMap_, |
| (int)nameMap_->size()); |
| } |
| |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcessContainer::DelFromNameMap( CProcess *process ) |
| { |
| const char method_name[] = "CProcessContainer::DelFromNameMap"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| CProcess *p2 = GetProcess ( process->GetName(), false ); |
| if ( p2 == NULL) |
| { // Process was not in the map, no need to erase |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d not removing from nameMap %p: %s (%d, %d)." |
| " No such mapping\n", |
| method_name, __LINE__, nameMap_, |
| process->GetName(), process->GetNid(), process->GetPid()); |
| } |
| } |
| else if (p2 != process) |
| { |
| // Name was in map but process object is not what we were expecting |
| // so leave it alone |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d not removing from nameMap %p: %s (%d, %d)." |
| " Map contains %s (%d, %d)\n", |
| method_name, __LINE__, nameMap_, |
| process->GetName(), process->GetNid(), process->GetPid(), |
| p2->GetName(), p2->GetNid(), p2->GetPid()); |
| } |
| } |
| else |
| { |
| |
| nameMapLock_.lock(); |
| int count = nameMap_->erase ( process->GetName() ); |
| nameMapLock_.unlock(); |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| if (count != 0) |
| { |
| trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n", |
| method_name, __LINE__, nameMap_, |
| process->GetName(), process->GetNid(), process->GetPid()); |
| } |
| } |
| } |
| |
| } |
| |
| void CProcessContainer::AddToList(CProcess *process) |
| { |
| const char method_name[] = "CProcessContainer::AddToList"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| if (process) |
| { |
| // link it to the CNode container |
| if (head_ == NULL) |
| { |
| head_ = tail_ = process; |
| process->prev_ = NULL; |
| } |
| else |
| { |
| tail_->next_ = process; |
| process->prev_ = tail_; |
| tail_ = process; |
| } |
| process->next_ = NULL; |
| numProcs_++; |
| |
| // link it to the CLNode container |
| CLNode *lnode = Nodes->GetLNode( process->Nid ); |
| lnode->AddToListL( process ); |
| |
| if (trace_settings & (TRACE_PROCESS_DETAIL)) |
| { |
| CNode *node = lnode->GetNode(); |
| trace_printf("%s@%d" " container %p - pnid=%d, process count=%d, pnode=%d" "\n", method_name, __LINE__, this, node->GetPNid(), numProcs_, nodeContainer_); |
| } |
| |
| AddToNameMap(process); |
| if ( process->Pid != -1 ) |
| { |
| AddToPidMap(process->Pid, process); |
| } |
| |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcessContainer::AddToListL(CProcess *process) |
| { |
| const char method_name[] = "CProcessContainer::AddToListL"; |
| TRACE_ENTRY; |
| |
| if ( nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CLNode (the logical node) |
| abort(); |
| } |
| |
| if (process) |
| { |
| // link it to the CLNode container |
| if (head_ == NULL) |
| { |
| head_ = tail_ = process; |
| process->prevL_ = NULL; |
| } |
| else |
| { |
| tail_->nextL_ = process; |
| process->prevL_ = tail_; |
| tail_ = process; |
| } |
| process->nextL_ = NULL; |
| numProcs_++; |
| |
| if (trace_settings & (TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d" " - container %p nid=%d, process count=%d, pnode=%d" "\n", method_name, __LINE__, this, process->Nid, numProcs_, nodeContainer_); |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::AttachProcessCheck ( struct message_def *msg ) |
| { |
| CProcess *process; |
| char la_buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CProcessContainer::AttachProcessCheck"; |
| TRACE_ENTRY; |
| |
| assert ( msg != NULL); |
| |
| if ( msg->u.request.u.startup.startup_size != sizeof(msg->u.request.u.startup) ) |
| { |
| snprintf(la_buf, sizeof(la_buf), "[%s], Startup message from %s has invalid size=%d, expecting size=%d\n", |
| method_name, msg->u.request.u.startup.process_name, |
| msg->u.request.u.startup.startup_size, |
| (int) sizeof(msg->u.request.u.startup)); |
| mon_log_write(MON_PROCESSCONT_ATTACHPCHECK_1, SQ_LOG_ERR, la_buf); |
| |
| abort(); // TODO: revisit |
| } else if ((MyNode->GetState() != State_Up && |
| MyNode->GetState() != State_Shutdown) && |
| ( strcmp(msg->u.request.u.startup.program,"shell")!=0 ) ) |
| { |
| // Check if we can accept a connection |
| snprintf(la_buf, sizeof(la_buf), "[%s], Can't accept %s because node is logically down\n", method_name, msg->u.request.u.startup.process_name); |
| mon_log_write(MON_PROCESSCONT_ATTACHPCHECK_1, SQ_LOG_ERR, la_buf); |
| |
| msg->u.reply.type = ReplyType_Generic; |
| msg->u.reply.u.generic.nid = -1; |
| msg->u.reply.u.generic.pid = -1; |
| msg->u.reply.u.generic.verifier = -1; |
| msg->u.reply.u.generic.process_name[0] = '\0'; |
| msg->u.reply.u.generic.return_code = MPI_ERR_OP; |
| } |
| |
| // shell is trying to attach across all nodes |
| else if (msg->u.request.u.startup.paired) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_SYNC | TRACE_INIT | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - paired attach" "\n", method_name, __LINE__); |
| Nodes->GetLNode (msg->u.request.u.startup.process_name, &process); |
| if (process) |
| { |
| process->SetPaired ( true ); |
| process->SetClone( false ); |
| msg->u.reply.type = ReplyType_Generic; |
| msg->u.reply.u.generic.nid = process->GetNid(); |
| msg->u.reply.u.generic.pid = process->GetPid(); |
| msg->u.reply.u.generic.verifier = process->GetVerifier(); |
| strcpy (msg->u.reply.u.generic.process_name, process->GetName()); |
| msg->u.reply.u.generic.return_code = MPI_SUCCESS; |
| } |
| else |
| { |
| // Can't find process |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Can't find or clone Process %s to pair attach!\n", |
| method_name, msg->u.request.u.startup.process_name); |
| mon_log_write(MON_PROCESSCONT_ATTACHPCHECK_2, SQ_LOG_ERR, la_buf); |
| |
| msg->u.reply.type = ReplyType_Generic; |
| msg->u.reply.u.generic.nid = -1; |
| msg->u.reply.u.generic.pid = -1; |
| msg->u.reply.u.generic.verifier = -1; |
| msg->u.reply.u.generic.process_name[0] = '\0'; |
| msg->u.reply.u.generic.return_code = MPI_ERR_NAME; |
| } |
| } |
| // check if its an attach request, if so setup the process |
| else if ((msg->u.request.u.startup.nid == -1) && |
| (msg->u.request.u.startup.pid == -1) ) |
| { |
| Nodes->GetLNode (msg->u.request.u.startup.process_name, &process); |
| if (!process) |
| { |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_SYNC | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - process attaching" "\n", method_name, __LINE__); |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from MyNode (the local physical node) |
| abort(); |
| } |
| if ( ! MyNode->IsSpareNode() ) |
| { |
| int nid = MyNode->AssignNid(); |
| if ( (nid == -1) && (MyNode->GetState() != State_Up) ) |
| { |
| snprintf( la_buf, sizeof(la_buf), |
| "[%s], Can't attach the pid %d (program: %s) - the monitor is not up yet (curr state: %d).\n", |
| method_name, |
| msg->u.request.u.startup.os_pid, |
| msg->u.request.u.startup.program, |
| MyNode->GetState() ); |
| mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_4, SQ_LOG_ERR, la_buf ); |
| |
| msg->u.reply.type = ReplyType_Generic; |
| msg->u.reply.u.generic.nid = -1; |
| msg->u.reply.u.generic.pid = -1; |
| msg->u.reply.u.generic.verifier = -1; |
| msg->u.reply.u.generic.process_name[0] = '\0'; |
| msg->u.reply.u.generic.return_code = MPI_ERR_NAME; |
| } |
| else |
| { |
| strId_t progStrId = MyNode->GetStringId( msg->u.request.u.startup.program ); |
| strId_t nullStrId = { -1, -1 }; |
| process = |
| new CProcess( NULL, nid, msg->u.request.u.startup.os_pid, ProcessType_Generic, 0, 0, false, true, (char *) "", |
| nullStrId, nullStrId, progStrId, (char *) "", (char *) "" ); |
| if ( process == NULL ) |
| { |
| //TODO: Log event |
| abort(); |
| } |
| if ( process ) |
| { |
| char user_argv[MAX_ARGS][MAX_ARG_SIZE]; |
| process->userArgs( 0, user_argv ); |
| } |
| if ( msg->u.request.u.startup.process_name[0] == '\0' ) |
| { // Create a name for the process and place it in the |
| // Name member of the process object); |
| char pname[MAX_KEY_NAME]; |
| MyNode->BuildOurName( nid, process->GetPid( ), pname ); |
| process->SetName( pname ); |
| } |
| else |
| { |
| process->SetName( |
| MyNode->NormalizeName( msg->u.request.u.startup.process_name ) ); |
| } |
| process->SetAttached( true ); |
| process->SetupFifo( process->GetNid( ), msg->u.request.u.startup.os_pid ); |
| process->SetCreationTime( msg->u.request.u.startup.os_pid ); |
| process->SetVerifier( ); // CProcessContainer::AttachProcessCheck |
| AddToList( process ); |
| process->CompleteProcessStartup( msg->u.request.u.startup.port_name, // CProcessContainer::AttachProcessCheck |
| msg->u.request.u.startup.os_pid, |
| msg->u.request.u.startup.event_messages, |
| msg->u.request.u.startup.system_messages, |
| false, |
| NULL, |
| MyPNID ); |
| |
| msg->u.reply.type = ReplyType_Startup; |
| msg->u.reply.u.startup_info.nid = process->GetNid( ); |
| msg->u.reply.u.startup_info.pid = process->GetPid( ); |
| msg->u.reply.u.startup_info.verifier = process->GetVerifier( ); |
| strcpy( msg->u.reply.u.startup_info.process_name, process->GetName( ) ); |
| msg->u.reply.u.startup_info.return_code = MPI_SUCCESS; |
| STRCPY( msg->u.reply.u.startup_info.fifo_stdin, |
| process->fifo_stdin() ); |
| STRCPY( msg->u.reply.u.startup_info.fifo_stdout, |
| process->fifo_stdout() ); |
| STRCPY( msg->u.reply.u.startup_info.fifo_stderr, |
| process->fifo_stderr() ); |
| |
| Monitor->writeProcessMapBegin( process->GetName( ) |
| , process->GetNid( ) |
| , process->GetPid( ) |
| , process->GetVerifier( ) |
| , -1, -1, -1 |
| , msg->u.request.u.startup.program ); |
| } |
| } |
| else |
| { |
| snprintf( la_buf, sizeof(la_buf), |
| "[%s], Can't attach, node is a spare node!\n", |
| method_name ); |
| mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_3, SQ_LOG_ERR, la_buf ); |
| |
| msg->u.reply.type = ReplyType_Startup; |
| msg->u.reply.u.startup_info.nid = -1; |
| msg->u.reply.u.startup_info.pid = -1; |
| msg->u.reply.u.startup_info.verifier = -1; |
| msg->u.reply.u.startup_info.process_name[0] = '\0'; |
| msg->u.reply.u.startup_info.return_code = MPI_ERR_NO_MEM; |
| } |
| } |
| else |
| { |
| // Find the duplicate process |
| snprintf( la_buf, sizeof(la_buf), |
| "[%s], Can't attach duplicate process %s!\n", |
| method_name, msg->u.request.u.startup.process_name ); |
| mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_4, SQ_LOG_ERR, la_buf ); |
| |
| msg->u.reply.type = ReplyType_Generic; |
| msg->u.reply.u.generic.nid = -1; |
| msg->u.reply.u.generic.pid = -1; |
| msg->u.reply.u.generic.verifier = -1; |
| msg->u.reply.u.generic.process_name[0] = '\0'; |
| msg->u.reply.u.generic.return_code = MPI_ERR_NAME; |
| } |
| } |
| // complete a monitor child process startup |
| else |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_SYNC | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - child attach" "\n", method_name, __LINE__); |
| Monitor->CompleteProcessStartup(msg); |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::Bcast (struct message_def *msg) |
| { |
| CProcess *process = NULL; |
| SharedMsgDef *shm = NULL; |
| SQ_LocalIOToClient::bcastPids_t *bcastPids = NULL; |
| unsigned int msgSize; |
| |
| const char method_name[] = "CProcessContainer::Bcast"; |
| TRACE_ENTRY; |
| |
| // Prepare a broadcast notice for sending by the local io "pending |
| // notice thread". Do this by formatting an image of the message |
| // to be sent along with a the list of process ids that will receive |
| // the notice. |
| pidMapLock_.lock(); |
| pidMap_t::iterator pidMapIt; |
| for ( pidMapIt = pidMap_->begin(); pidMapIt != pidMap_->end() ; pidMapIt++ ) |
| { |
| process = pidMapIt->second; |
| assert( process ); |
| if (process->IsSystemMessages() && |
| process->GetState() == State_Up) |
| { |
| if (trace_settings & (TRACE_REQUEST_DETAIL | TRACE_RECOVERY | TRACE_SYNC_DETAIL | TRACE_TMSYNC | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - Send notice to %s (%d, %d:%d)\n" |
| , method_name, __LINE__ |
| , process->GetName() |
| , process->GetNid() |
| , process->GetPid() |
| , process->GetVerifier() ); |
| |
| if (!shm) |
| { // First process, allocate a buffer for the notice image |
| // and initialize it. |
| shm = new SharedMsgDef; |
| memset( &shm->trailer, 0, sizeof(shm->trailer) ); |
| bcastPids = new SQ_LocalIOToClient::bcastPids_t; |
| assert(bcastPids); |
| |
| msgSize = SQ_theLocalIOToClient->getSizeOfMsg( msg ); |
| |
| if ( msgSize > sizeof ( message_def ) ) |
| { // Not expected to occur but guard against client |
| // buffer overrun |
| msgSize = sizeof ( message_def ); |
| } |
| |
| memcpy( &shm->msg, msg, msgSize ); |
| shm->trailer.OSPid = BCAST_PID; |
| shm->trailer.verifier = -1; |
| } |
| // Add this process id to the list. |
| SQ_LocalIOToClient::pidVerifier_t pv; |
| pv.pv.pid = process->GetPid(); |
| pv.pv.verifier = process->GetVerifier(); |
| bcastPids->insert( pv.pnv ); |
| } |
| } |
| pidMapLock_.unlock(); |
| |
| if (shm) |
| { |
| SQ_theLocalIOToClient->putOnNoticeQueue( BCAST_PID |
| , -1 |
| , &shm->msg |
| , bcastPids); |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| char *CProcessContainer::BuildOurName( int nid, int pid, char *name ) |
| { |
| const char method_name[] = "CProcessContainer::BuildOurName"; |
| TRACE_ENTRY; |
| |
| int i; |
| int rem; |
| int cnt[6]; |
| |
| if (!processNameFormatLong_) |
| { |
| // Convert Pid into base 35 acsii |
| cnt[0] = pid / 42875; // (35 * 35 * 35) |
| rem = pid - ( cnt[0] * 42875 ); |
| cnt[1] = rem / 1225; // (35 * 35) |
| rem -= ( cnt[1] * 1225 ); |
| cnt[2] = rem / 35; |
| rem -= ( cnt[2] * 35 ); |
| cnt[3] = rem; |
| |
| // Process name format long: '$Zxxpppp' xx = nid, pppp = pid |
| |
| // Convert Nid into base 16 acsii |
| sprintf(name,"$Z%2.2X",nid); |
| |
| // Convert Pid into base 36 ascii |
| for(i=3; i>=0; i--) |
| { |
| if( cnt[i] < 10 ) |
| { |
| name[i+4] = '0'+cnt[i]; |
| } |
| else |
| { |
| cnt[i] -= 10; |
| // we are skipping cap 'o' because it looks like zero. |
| if( cnt[i] >= 14 ) |
| { |
| name[i+4] = 'P'+(cnt[i]-14); |
| } |
| else |
| { |
| name[i+4] = 'A'+cnt[i]; |
| } |
| } |
| } |
| name[8] = '\0'; |
| } |
| else |
| { |
| // We are skipping 'A', 'I', 'O', and 'U' to distinguish between zero |
| // and one digits, and for political correctness in generated names |
| char b32table[32] = {'0','1','2','3','4','5','6','7','8','9' |
| ,'B','C','D','E','F','G','H','J','K','L','M' |
| ,'N','P','Q','R','S','T','V','W','X','Y','Z' }; |
| |
| // Convert Pid into base 32 ascii |
| cnt[0] = pid / 33554432; // (32 * 32 * 32 * 32 * 32) |
| rem = pid - ( cnt[0] * 33554432 ); |
| cnt[1] = rem / 1048576; // (32 * 32 * 32 * 32) |
| rem -= ( cnt[1] * 1048576 ); |
| cnt[2] = rem / 32768; // (32 * 32 * 32) |
| rem -= ( cnt[2] * 32768 ); |
| cnt[3] = rem / 1024; // (32 * 32) |
| rem -= ( cnt[3] * 1024 ); |
| cnt[4] = rem / 32; |
| rem -= ( cnt[4] * 32 ); |
| cnt[5] = rem; |
| |
| // Process name format long: '$Zxxxxpppppp' xxxx = nid, pppppp = pid |
| |
| // Convert Nid into base 16 ascii |
| sprintf(name,"$Z%4.4X",nid); |
| |
| // Convert Pid into base 32 ascii |
| for(i=5; i>=0; i--) |
| { |
| name[i+6] = static_cast<char>(b32table[cnt[i]]); |
| } |
| name[12] = '\0'; |
| } |
| |
| TRACE_EXIT; |
| return name; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcessContainer::CancelDeathNotification( int nid |
| , int pid |
| , int verifier |
| , _TM_Txid_External trans_id) |
| { |
| bool status = FAILURE; |
| CProcess *process = head_; |
| |
| // we will loop through all processes on the node ... return FAILURE |
| // only if we don't find any notices to cancel. |
| while (process) |
| { |
| status = process->CancelDeathNotification (nid, pid, verifier, trans_id); |
| process = process->GetNext (); |
| } |
| |
| return status; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| // Child_Exit terminates all child processes created by the parent process |
| // unless the child process is Unhooked from the parent process |
| void CProcessContainer::Child_Exit ( CProcess * parent ) |
| { |
| CProcess *process; |
| |
| const char method_name[] = "CProcessContainer::Child_Exit"; |
| TRACE_ENTRY; |
| if (trace_settings & TRACE_ENTRY_EXIT) |
| trace_printf("%s@%d with parent (%d, %d)\n", method_name, __LINE__, parent->GetNid(), parent->GetPid() ); |
| |
| if ( parent && |
| ((MyNode->GetState() != State_Shutdown && |
| MyNode->GetShutdownLevel() == ShutdownLevel_Undefined) |
| || (parent->GetType() == ProcessType_SPX) ) ) |
| { |
| CProcess::nidPid_t child; |
| CLNode * childLNode; |
| |
| while ( parent->childRemoveFirst ( child )) |
| { |
| |
| childLNode = Nodes->GetLNode( child.nid ); |
| process = (childLNode != NULL ) |
| ? childLNode->GetNode()->GetProcess( child.pid ) : NULL; |
| |
| if ( process && (!process->IsUnhooked()) ) |
| { |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Child process %s (%d, %d) exits due " |
| "to parent death (%d, %d)\n", |
| method_name, __LINE__, process->GetName(), |
| process->GetNid(), process->GetPid(), |
| parent->GetNid(), parent->GetPid()); |
| |
| childLNode->SetProcessState( process, State_Down, true ); |
| if ( !process->IsClone() ) |
| { |
| if ( parent->GetType() == ProcessType_SPX ) |
| { |
| kill (process->GetPid(), SIGKILL); |
| } |
| else |
| { |
| kill (process->GetPid(), Monitor->GetProcTermSig()); |
| } |
| } |
| else |
| { |
| if (NameServerEnabled) |
| { |
| CNode* childNode = childLNode->GetNode(); |
| // Forward the process kill to the target node |
| int rc = PtpClient->ProcessKill( process |
| , process->GetAbort() |
| , childLNode->GetNid() |
| , childNode->GetName() ); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process kill " |
| "request for child process %s (%d, %d) " |
| "to child node %s, nid=%d\n" |
| , method_name |
| , process->GetName() |
| , process->GetNid() |
| , process->GetPid() |
| , childNode->GetName() |
| , childLNode->GetNid() ); |
| mon_log_write(MON_PROCESSCONT_CHILDEXIT_1, SQ_LOG_ERR, la_buf); |
| } |
| } |
| } |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Completed kill for child process %s (%d, %d)\n" |
| , method_name, __LINE__ |
| , process->GetName() |
| , process->GetNid() |
| , process->GetPid()); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| { |
| if (process) |
| { |
| trace_printf("%s@%d - Child process %s (%d, %d), not " |
| "killed, state=%d, unhooked=%d\n", |
| method_name, __LINE__, process->GetName(), |
| process->GetNid(), process->GetPid(), |
| process->GetState(), process->IsUnhooked()); |
| |
| } |
| } |
| |
| } |
| } |
| } |
| TRACE_EXIT; |
| } |
| |
| void CProcessContainer::ChildUnHooked_Exit( CProcess* parent ) |
| { |
| const char method_name[] = "CProcessContainer::ChildUnHooked_Exit"; |
| TRACE_ENTRY; |
| |
| CProcess *process; |
| |
| if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d with parent %s (%d,%d:%d)\n" |
| , method_name, __LINE__ |
| , parent->GetName() |
| , parent->GetNid() |
| , parent->GetPid() |
| , parent->GetVerifier() ); |
| |
| if (NameServerEnabled) |
| { |
| if ( parent && !parent->IsClone() |
| && ((MyNode->GetState() != State_Shutdown |
| && MyNode->GetShutdownLevel() == ShutdownLevel_Undefined)) ) |
| { |
| CProcess::nidPid_t child; |
| CLNode* childLNode; |
| |
| while ( parent->childUnHookedRemoveFirst( child )) |
| { |
| childLNode = Nodes->GetLNode( child.nid ); |
| process = (childLNode != NULL ) |
| ? childLNode->GetNode()->GetProcess( child.pid ) : NULL; |
| if (process) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| { |
| trace_printf( "%s@%d - Telling unhooked child process %s (%d,%d:%d) " |
| "of parent death %s (%d,%d:%d)\n" |
| , method_name, __LINE__ |
| , process->GetName() |
| , process->GetNid() |
| , process->GetPid() |
| , process->GetVerifier() |
| , parent->GetName() |
| , parent->GetNid() |
| , parent->GetPid() |
| , parent->GetVerifier() ); |
| } |
| |
| CNode* childNode = childLNode->GetNode(); |
| // Forward the parent's process exit to the child's node |
| int rc = PtpClient->ProcessExit( parent |
| , childLNode->GetNid() |
| , childNode->GetName() ); |
| if (rc) |
| { |
| char la_buf[MON_STRING_BUF_SIZE]; |
| snprintf( la_buf, sizeof(la_buf) |
| , "[%s] - Can't send process exit " |
| "request for parent process %s (%d,%d:%d) " |
| "to child's node %s, nid=%d\n" |
| , method_name |
| , parent->GetName() |
| , parent->GetNid() |
| , parent->GetPid() |
| , parent->GetVerifier() |
| , childNode->GetName() |
| , childLNode->GetNid() ); |
| mon_log_write(MON_PROCESSCONT_CHILDEXIT_1, SQ_LOG_ERR, la_buf); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf( "%s@%d - Completed kill for parent process %s (%d,%d:%d)\n" |
| , method_name, __LINE__ |
| , parent->GetName() |
| , parent->GetNid() |
| , parent->GetPid() |
| , parent->GetVerifier() ); |
| } |
| } |
| } |
| } |
| } |
| TRACE_EXIT; |
| } |
| #endif |
| |
| void CProcessContainer::CleanUpProcesses( void ) |
| { |
| CProcess *process = head_; |
| |
| const char method_name[] = "CProcessContainer::CleanUpProcesses"; |
| TRACE_ENTRY; |
| |
| while (process) |
| { |
| DelFromNameMap ( process ); |
| DelFromPidMap ( process ); |
| |
| DeleteFromList(process); |
| process = head_; |
| } |
| numProcs_ = 0; |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - process count=%d" "\n", method_name, __LINE__, numProcs_); |
| |
| TRACE_EXIT; |
| } |
| |
| CProcess *CProcessContainer::CloneProcess (int nid, |
| PROCESSTYPE type, |
| int priority, |
| int backup, |
| bool unhooked, |
| char *process_name, |
| char *port, |
| int os_pid, |
| int verifier, |
| int parent_nid, |
| int parent_pid, |
| int parent_verifier, |
| bool event_messages, |
| bool system_messages, |
| #ifdef NAMESERVER_PROCESS |
| char *path, |
| char *ldpath, |
| char *program, |
| #else |
| strId_t pathStrId, |
| strId_t ldpathStrId, |
| strId_t programStrId, |
| #endif |
| char *infile, |
| char *outfile, |
| struct timespec *creation_time, |
| int origPNidNs) |
| { |
| char pname[MAX_PROCESS_NAME]; |
| CProcess *process; |
| CProcess *parent = NULL; |
| char la_buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CProcessContainer::CloneProcess"; |
| TRACE_ENTRY; |
| |
| // load & normalize process name |
| if( process_name[0] == '\0' ) |
| { |
| pname[0] = '\0'; |
| } |
| else |
| { |
| STRCPY (pname, NormalizeName (process_name)); |
| } |
| |
| if (parent_nid != -1) |
| { |
| parent = Nodes->GetLNode (parent_nid)->GetProcessL(parent_pid); |
| } |
| |
| if (backup) |
| { |
| if (!parent) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Failed, Backup does not have parent's name.\n", |
| method_name); |
| mon_log_write(MON_PROCESSCONT_CLONEPROCESS_1, SQ_LOG_ERR, la_buf); |
| return NULL; |
| } |
| if (parent_nid == nid) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Failed, Backup can't be in parent's node.\n", |
| method_name); |
| mon_log_write(MON_PROCESSCONT_CLONEPROCESS_2, SQ_LOG_ERR, la_buf); |
| return NULL; |
| } |
| } |
| else |
| { |
| if (pname[0] != '\0') |
| { |
| Nodes->GetLNode (pname, &process); |
| if (process) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Failed, Duplicate processname (%s).\n", |
| method_name, process_name); |
| mon_log_write(MON_PROCESSCONT_CLONEPROCESS_3, SQ_LOG_ERR, la_buf); |
| return NULL; |
| } |
| } |
| } |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) |
| trace_printf("%s@%d - Process name=%s (%d, %d), port=%s, " |
| "parent (%d, %d)\n", method_name, |
| __LINE__, pname, nid, os_pid, port, parent_nid, parent_pid); |
| |
| #ifdef NAMESERVER_PROCESS |
| process = |
| new CProcess( parent |
| , nid |
| , os_pid |
| , verifier |
| , type |
| , priority |
| , backup |
| , false |
| , unhooked |
| , pname |
| , path |
| , ldpath |
| , program |
| , infile |
| , outfile); |
| #else |
| process = |
| new CProcess (parent, nid, os_pid, type, priority, backup, false, unhooked, pname, pathStrId, ldpathStrId, |
| programStrId, infile, outfile); |
| #endif |
| |
| if (process) |
| { |
| process->SetVerifier(verifier); // CProcessContainer::CloneProcess |
| process->SetParentVerifier(parent_verifier); |
| |
| AddToList( process ); |
| |
| process->CompleteProcessStartup (port, os_pid, event_messages, system_messages, os_pid==-1, creation_time, origPNidNs); // CProcessContainer::CloneProcess |
| } |
| |
| TRACE_EXIT; |
| return process; |
| } |
| |
| |
| CProcess *CProcessContainer::CompleteProcessStartup (char *process_name, |
| char *port, |
| int os_pid, |
| bool event_messages, |
| bool system_messages, |
| struct timespec *creation_time, |
| int origPNidNs) |
| { |
| CProcess *process = NULL; |
| |
| const char method_name[] = "CProcessContainer::CompleteProcessStartup"; |
| TRACE_ENTRY; |
| |
| if ( nodeContainer_ ) |
| { |
| process = GetProcess(process_name,false); |
| } |
| else |
| { |
| // Not supposed to be able to get here. |
| abort(); |
| |
| } |
| if (process) |
| { |
| if (process->GetPid() != os_pid) |
| { // Process id changed from when we started the process. |
| #ifndef NAMESERVER_PROCESS |
| if ( !process->IsUnhooked() ) |
| { // Parent process object keeps track of child processes |
| // created on this node. Needed in case parent process |
| // exits abnormally. |
| int parentNid; |
| int parentPid; |
| if ( ! process->IsBackup() ) |
| { |
| parentNid = process->GetParentNid(); |
| parentPid = process->GetParentPid(); |
| } |
| else |
| { |
| parentNid = process->GetPairParentNid(); |
| parentPid = process->GetPairParentPid(); |
| } |
| |
| if ( parentNid != -1 && parentPid != -1 ) |
| { |
| CProcess* parent; |
| parent = Nodes->GetLNode ( parentNid ) |
| ->GetProcessL( parentPid ); |
| if ( parent && !process->IsBackup() ) |
| { |
| parent->childRemove ( process->GetNid(), |
| process->GetPid() ); |
| parent->childAdd ( process->GetNid(), os_pid ); |
| } |
| } |
| } |
| if (NameServerEnabled) |
| { |
| if (process->IsUnhooked()) |
| { // Parent process object keeps track of child processes |
| // created. Needed when parent process exits to clean up |
| // parent clone process object in remote nodes. |
| int parentNid; |
| int parentPid; |
| CProcess* parent; |
| if ( !process->IsBackup() ) |
| { |
| parentNid = process->GetParentNid(); |
| parentPid = process->GetParentPid(); |
| } |
| else |
| { |
| parentNid = process->GetPairParentNid(); |
| parentPid = process->GetPairParentPid(); |
| } |
| |
| if ( parentNid != -1 && parentPid != -1 ) |
| { |
| parent = Nodes->GetLNode(parentNid)->GetProcessL(parentPid); |
| if ( parent && !parent->IsClone() && !process->IsBackup() ) |
| { |
| parent->childUnHookedRemove( process->GetNid() |
| , process->GetPid() ); |
| parent->childUnHookedAdd( process->GetNid() |
| , os_pid ); |
| } |
| } |
| } |
| } |
| #endif |
| // Process id changed from when we started the process. So |
| // remap using the new pid. [This could happen if, for example, |
| // a shell script was the originally started process and it |
| // then started the process that is now sending its startup message] |
| if (trace_settings & TRACE_PROCESS) |
| { |
| trace_printf("%s@%d - process id changed, new pid at process" |
| " startup=%d, original pid=%d\n", |
| method_name, __LINE__, os_pid, |
| process->GetPid() ); |
| } |
| AddToPidMap ( os_pid, process ); |
| } |
| process->CompleteProcessStartup (port, os_pid, event_messages, system_messages, false, creation_time, origPNidNs); // CProcessContainer::CompleteProcessStartup |
| } |
| // When using process maps do not log an error if the process is |
| // not found. This method can be called from |
| // CCluster::HandleOtherNodeMsg to check if process exists. |
| TRACE_EXIT; |
| return process; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| CProcess *CProcessContainer::CreateProcess (CProcess * parent, |
| int nid, |
| PROCESSTYPE type, |
| int debug, |
| int priority, |
| int backup, |
| bool unhooked, |
| char *process_name, |
| strId_t pathStrId, |
| strId_t ldpathStrId, |
| strId_t programStrId, |
| char *infile, |
| char *outfile, |
| void *tag, |
| int &result) |
| { |
| CProcess *process = NULL; |
| char la_buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CProcessContainer::CreateProcess"; |
| TRACE_ENTRY; |
| |
| result = MPI_SUCCESS; |
| |
| // load & normalize process name |
| if( process_name[0] != '\0' ) |
| { |
| NormalizeName (process_name); |
| } |
| |
| if (backup) |
| { |
| if ( !parent || (strcmp (parent->GetName(), process_name) != 0) ) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Failed, Backup does not have parent's name.\n", |
| method_name); |
| mon_log_write(MON_PROCESSCONT_CREATEPROCESS_1, SQ_LOG_ERR, la_buf); |
| |
| result = MPI_ERR_NAME; |
| |
| return NULL; |
| } |
| if (parent->GetNid() == nid) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Failed, Backup can't be in parent's node.\n", |
| method_name); |
| mon_log_write(MON_PROCESSCONT_CREATEPROCESS_2, SQ_LOG_ERR, la_buf); |
| |
| result = MPI_ERR_RANK; |
| |
| return NULL; |
| } |
| } |
| else |
| { |
| Nodes->GetLNode (process_name, &process, false); |
| if (process) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Failed, Duplicate processname (%s).\n", |
| method_name, process_name); |
| mon_log_write(MON_PROCESSCONT_CREATEPROCESS_3, SQ_LOG_ERR, la_buf); |
| |
| result = MPI_ERR_NAME; |
| return NULL; |
| } |
| } |
| |
| process = |
| new CProcess (parent, nid, -1, type, priority, backup, debug, unhooked, process_name, |
| pathStrId, ldpathStrId, programStrId, infile, outfile); |
| if (process) |
| { |
| AddToList( process ); |
| if (type == ProcessType_NameServer || |
| type == ProcessType_Watchdog || |
| type == ProcessType_PSD || |
| type == ProcessType_SMS ) |
| { |
| if (type == ProcessType_NameServer) |
| { |
| process->userArgs ( monitorArgc, monitorArgv ); |
| } |
| if (process->Create (parent, tag, result)) // monitor |
| { |
| AddToPidMap(process->GetPid(), process); |
| } |
| } |
| else if ( type == ProcessType_SSMP ) |
| { |
| Nodes->GetLNode ( nid )->SetSSMProc ( process ); |
| } |
| } |
| TRACE_EXIT; |
| |
| return process; |
| } |
| #endif |
| |
| #ifdef NAMESERVER_PROCESS |
| void CProcessContainer::DeleteAllDown() |
| { |
| CProcess *process = NULL; |
| int nid = -1; |
| int pid = -1; |
| |
| const char method_name[] = "CProcessContainer::DeleteAllDown"; |
| TRACE_ENTRY; |
| |
| nameMap_t::iterator nameMapIt; |
| |
| while ( true ) |
| { |
| nameMapLock_.lock(); |
| nameMapIt = nameMap_->begin(); |
| |
| if (nameMap_->size() == 0) |
| { |
| nameMapLock_.unlock(); |
| break; // all done |
| } |
| |
| process = nameMapIt->second; |
| |
| // Delete name map entry |
| nameMap_->erase (nameMapIt); |
| |
| nameMapLock_.unlock(); |
| |
| nid = process->GetNid(); |
| pid = process->GetPid(); |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n", |
| method_name, __LINE__, nameMap_, |
| process->GetName(), nid, pid); |
| } |
| |
| // Delete pid map entry |
| DelFromPidMap ( process ); |
| |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| { |
| trace_printf( "%s@%d - Completed delete for %s (%d, %d)\n" |
| , method_name, __LINE__ |
| , process->GetName(), nid, pid); |
| } |
| |
| // Remove all processes |
| // PSD will re-create persistent processes on spare node activation |
| Exit_Process( process, true, nid ); |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| void CProcessContainer::DeleteFromList( CProcess *process ) |
| { |
| const char method_name[] = "CProcessContainer::DeleteFromList"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| if (process) |
| { |
| RemoveFromList( process ); |
| |
| if (process->replRefCount() == 0) |
| { // Process object is not in replication queue so ok to |
| // delete. |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d - Deleting process %s (%d, %d)\n", method_name, __LINE__, process->Name, process->Nid, process->Pid ); |
| } |
| delete process; |
| } |
| else |
| { // Process object is in replication queue. Replication |
| // queueing logic will delete the object once the replication |
| // has completed. Set the state here to indicate that |
| // the object is no longer on the process list. |
| process->SetState (State_Unlinked); |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d - Setting process %s (%d, %d) state to State_Unlinked\n", method_name, __LINE__, process->Name, process->Nid, process->Pid ); |
| } |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcessContainer::RemoveFromList( CProcess *process ) |
| { |
| const char method_name[] = "CProcessContainer::RemoveFromList"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| if (process) |
| { |
| CLNode *lnode = Nodes->GetLNode( process->Nid ); |
| lnode->RemoveFromListL( process ); |
| |
| if (head_ == process) |
| head_ = process->next_; |
| if (tail_ == process) |
| tail_ = process->prev_; |
| if (process->prev_) |
| process->prev_->next_ = process->next_; |
| if (process->next_) |
| process->next_->prev_ = process->prev_; |
| numProcs_--; |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| CNode *node = lnode->GetNode(); |
| trace_printf("%s@%d" " - container %p pnid=%d, process count=%d, pnode=%d" "\n", method_name, __LINE__, this, node->GetPNid(), numProcs_, nodeContainer_); |
| } |
| |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| void CProcessContainer::RemoveFromListL( CProcess *process ) |
| { |
| const char method_name[] = "CProcessContainer::RemoveFromListL"; |
| TRACE_ENTRY; |
| |
| if ( nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CLNode (the logical node) |
| abort(); |
| } |
| |
| if (process) |
| { |
| |
| if (head_ == process) |
| head_ = process->nextL_; |
| if (tail_ == process) |
| tail_ = process->prevL_; |
| if (process->prevL_) |
| process->prevL_->nextL_ = process->nextL_; |
| if (process->nextL_) |
| process->nextL_->prevL_ = process->prevL_; |
| numProcs_--; |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d" " - container %p nid=%d, process count=%d, pnode=%d" "\n", method_name, __LINE__, this, process->Nid, numProcs_, nodeContainer_); |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcessContainer::Dump_Process (CProcess *dumper, CProcess *process, char *core_path) |
| { |
| bool status; |
| |
| const char method_name[] = "CProcessContainer::Dump_Process"; |
| TRACE_ENTRY; |
| |
| status = process->Dump(dumper, core_path); |
| |
| TRACE_EXIT; |
| return status; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::DumpCallback( int nid, pid_t pid, int status ) |
| { |
| const char method_name[] = "CProcessContainer::DumpCallback"; |
| TRACE_ENTRY; |
| |
| if ( nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CLNode (the logical node) |
| abort(); |
| } |
| |
| CLNode *lnode = Nodes->GetLNode( nid ); |
| CNode *node = lnode->GetNode(); |
| |
| CProcess *process = node->GetProcess( pid ); |
| if ( process ) |
| { |
| if (WIFEXITED(status) && (WEXITSTATUS(status) == 0)) |
| { |
| if (trace_settings & TRACE_PROCESS) |
| { |
| trace_printf("%s@%d - dump successful, nid=%d, pid=%d\n", |
| method_name, __LINE__, nid, pid ); |
| } |
| process->SetDumpStatus( Dump_Success ); |
| } |
| else |
| { |
| if (trace_settings & TRACE_PROCESS) |
| { |
| trace_printf("%s@%d - dump failed, nid=%d, pid=%d\n", |
| method_name, __LINE__, nid, pid ); |
| } |
| process->SetDumpStatus( Dump_Failed ); |
| } |
| process->SetDumpState( Dump_Complete ); |
| |
| CReplDumpComplete *repl = new CReplDumpComplete( process ); |
| Replicator.addItem(repl); |
| } |
| else |
| { |
| if (trace_settings & TRACE_PROCESS) |
| { |
| trace_printf("%s@%d - dump process not found, nid=%d, pid=%d\n", |
| method_name, __LINE__, nid, pid ); |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| |
| #ifndef NAMESERVER_PROCESS |
| CProcess * CProcessContainer::ParentNewProcReply ( CProcess *process, int result ) |
| { |
| const char method_name[] = "CProcessContainer::ParentNewProcReply"; |
| TRACE_ENTRY; |
| |
| CProcess *parent = NULL; |
| |
| if (process->GetParentNid() != -1) |
| { |
| parent = Nodes->GetProcess( process->GetParentNid(), |
| process->GetParentPid() ); |
| } |
| |
| // If we have a parent process then it is expecting a reply |
| if (parent && !parent->IsClone() && !parent->IsPaired()) |
| { |
| if (!process->IsNowait()) |
| { // The new process request was "waited" so send reply now |
| struct message_def *reply_msg; |
| reply_msg = process->parentContext(); |
| |
| if ( reply_msg ) |
| { |
| // send reply to the parent |
| parent->ReplyNewProcess ( reply_msg, process, result ); |
| // Since we have replied parent context (i.e the request |
| // buffer) is no longer valid. |
| process->parentContext( NULL ); |
| } |
| } |
| else |
| { // The new process request was "no-wait" so send notice now |
| process->SendProcessCreatedNotice(parent, result); |
| } |
| } |
| |
| TRACE_EXIT; |
| |
| return parent; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNode) |
| { |
| bool restarted = false; |
| char la_buf[MON_STRING_BUF_SIZE]; |
| CProcess *parent = NULL; |
| |
| const char method_name[] = "CProcessContainer::Exit_Process(process)"; |
| TRACE_ENTRY; |
| |
| if (process) |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - Process %s (abended=%d) is exiting, abend=%d, downNode=%d\n" |
| , method_name, __LINE__ |
| , process->GetName() |
| , process->IsAbended() |
| , abend |
| , downNode ); |
| |
| if ( process->GetState() == State_Down && abend && !process->IsAbended() ) |
| { |
| process->SetAbended( abend ); |
| } |
| if (process->GetNid() == downNode && !process->IsAbended() ) |
| { |
| process->SetAbended( abend ); |
| } |
| |
| if ( numProcs_ <= 0 ) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Node's process count is invalid, aborting\n", |
| method_name); |
| mon_log_write(MON_PROCESSCONT_EXITPROCESS_1, SQ_LOG_ERR, la_buf); |
| abort(); |
| } |
| |
| if ( process->GetState() == State_Stopped ) |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - Process " "%s" " already exited." "\n", method_name, __LINE__, process->GetName()); |
| return; |
| } |
| |
| if (!process->IsStartupCompleted()) |
| { |
| parent = ParentNewProcReply ( process, MPI_ERR_SPAWN ); |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), |
| "[%s], Exiting process %s (%d, %d) did not complete " |
| "startup\n", |
| method_name, process->GetName(), process->GetNid(), |
| process->GetPid()); |
| mon_log_write(MON_PROCESSCONT_EXITPROCESS_2, SQ_LOG_ERR, buf); |
| } |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - Process %s is exiting, persistent=%d, abended=%d\n" |
| , method_name, __LINE__ |
| , process->GetName() |
| , process->IsPersistent() |
| , process->IsAbended() ); |
| |
| if ( process->IsPersistent() && |
| (process->IsAbended() || process->GetType() == ProcessType_SPX)) |
| { |
| Child_Exit(process); |
| } |
| |
| if (!process->IsClone() && NameServerEnabled) |
| { |
| if (process->childUnHookedCount() > 0) |
| { |
| ChildUnHooked_Exit(process); |
| } |
| } |
| |
| if ( parent == NULL) |
| { |
| parent = Nodes->GetProcess( process->GetParentNid(), |
| process->GetParentPid() ); |
| } |
| |
| // Unregister any interest in other process' death |
| _TM_Txid_External transid; |
| transid = invalid_trans(); |
| process->procExitUnregAll( transid ); |
| |
| // Handle the process termination |
| process->Exit( parent ); |
| |
| process->Switch( parent ); // switch process pair roles if needed |
| |
| if ( process->IsPersistent() && |
| process->GetAbort() == false && |
| !MyNode->IsActivatingSpare() && |
| !MyNode->IsKillingNode() && |
| MyNode->GetShutdownLevel() == ShutdownLevel_Undefined && |
| (process->IsAbended()|| |
| process->GetNid() == downNode || |
| process->GetType() == ProcessType_SPX)) |
| { |
| // see if we can restart the process |
| restarted = RestartPersistentProcess( process, downNode ); |
| if ( !restarted ) |
| { |
| if (!process->IsClone() && !MyNode->isInQuiesceState()) |
| { |
| // Replicate the exit to other nodes |
| if (!NameServerEnabled) |
| { |
| // Replicate the exit to other nodes |
| CReplExit *repl = new CReplExit(process->GetNid(), |
| process->GetPid(), |
| process->GetVerifier(), |
| process->GetName(), |
| process->IsAbended()); |
| Replicator.addItem(repl); |
| } |
| } |
| else |
| { |
| if (trace_settings & TRACE_SYNC) |
| { |
| trace_printf("%s@%d - not queuing process exit for clone %s\n", method_name, __LINE__, process->GetName()); |
| } |
| } |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - Persistent Process " "%s" " did not re-start on nid=" "%d" "\n", method_name, __LINE__, process->GetName(), process->GetNid()); |
| |
| CNode * node; |
| node = Nodes->GetLNode(process->GetNid())->GetNode(); |
| node->DeleteFromList( process ); |
| } |
| } |
| else |
| { |
| process->SetState (State_Stopped); |
| if ( !process->IsClone() && |
| (!MyNode->IsKillingNode() || MyNode->IsSoftNodeDown()) && |
| !MyNode->isInQuiesceState() && |
| !(process->GetType() == ProcessType_DTM && |
| process->IsAbended() && |
| MyNode->GetShutdownLevel() == ShutdownLevel_Undefined) ) |
| { |
| if (!NameServerEnabled) |
| { |
| // Replicate the exit to other nodes |
| CReplExit *repl = new CReplExit(process->GetNid(), |
| process->GetPid(), |
| process->GetVerifier(), |
| process->GetName(), |
| process->IsAbended()); |
| Replicator.addItem(repl); |
| } |
| } |
| else |
| { |
| if (trace_settings & TRACE_SYNC) |
| { |
| trace_printf("%s@%d - not queuing process exit for clone %s\n", method_name, __LINE__, process->GetName()); |
| } |
| } |
| process->SetDeletePending ( true ); |
| if (process->IsAbended() || process->GetType() == ProcessType_SPX) |
| { |
| Child_Exit(process); |
| } |
| |
| if (!process->IsClone() && process->GetType() == ProcessType_Watchdog) |
| { |
| HealthCheck.setState(HC_UPDATE_WATCHDOG, (long long)NULL); |
| } |
| CNode * node; |
| node = Nodes->GetLNode(process->GetNid())->GetNode(); |
| node->DeleteFromList( process ); |
| |
| } |
| } |
| TRACE_EXIT; |
| |
| return; |
| } |
| #endif |
| |
| #ifdef NAMESERVER_PROCESS |
| void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNode) |
| { |
| const char method_name[] = "CProcessContainer::Exit_Process(process)"; |
| TRACE_ENTRY; |
| |
| char la_buf[MON_STRING_BUF_SIZE]; |
| CProcess *parent = NULL; |
| |
| if (process) |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - Process %s (abended=%d) is exiting, abend=%d, downNode=%d\n" |
| , method_name, __LINE__ |
| , process->GetName() |
| , process->IsAbended() |
| , abend |
| , downNode ); |
| |
| if ( process->GetState() == State_Down && abend && !process->IsAbended() ) |
| { |
| process->SetAbended( abend ); |
| } |
| if (process->GetNid() == downNode && !process->IsAbended() ) |
| { |
| process->SetAbended( abend ); |
| } |
| |
| if ( numProcs_ <= 0 ) |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Node's process count is invalid, aborting\n", |
| method_name); |
| mon_log_write(MON_PROCESSCONT_EXITPROCESS_1, SQ_LOG_ERR, la_buf); |
| abort(); |
| } |
| |
| if ( process->GetState() == State_Stopped ) |
| { |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - Process " "%s" " already exited." "\n", method_name, __LINE__, process->GetName()); |
| return; |
| } |
| |
| if ( parent == NULL) |
| { |
| parent = Nodes->GetProcess( process->GetParentNid(), |
| process->GetParentPid() ); |
| } |
| |
| // Handle the process termination |
| process->Switch( parent ); // switch process pair roles if needed |
| process->SetDeletePending ( true ); |
| |
| CNode *node; |
| node = Nodes->GetLNode(process->GetNid())->GetNode(); |
| node->DelFromNameMap ( process ); |
| node->DelFromPidMap ( process ); |
| node->DeleteFromList( process ); |
| } |
| TRACE_EXIT; |
| |
| return; |
| } |
| #endif |
| |
| CProcess *CProcessContainer::GetProcess (int pid) |
| { |
| const char method_name[] = "CProcessContainer::GetProcess (pid)"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| pidMap_t::iterator it; |
| CProcess *entry = NULL; |
| |
| pidMapLock_.lock(); |
| it = pidMap_->find(pid); |
| if (it != pidMap_->end()) |
| { |
| entry = it->second; |
| |
| // bugcatcher, temp call |
| entry->validateObj(); |
| } |
| pidMapLock_.unlock(); |
| |
| if (trace_settings & TRACE_PROCESS_DETAIL) |
| { |
| trace_printf("%s@%d - pidmap_ (%p) entry=%p, pid=%d, Name=%s\n", |
| method_name, __LINE__, pidMap_, entry, pid, |
| ((entry != NULL) ? entry->GetName(): "")); |
| } |
| |
| TRACE_EXIT; |
| return entry; |
| } |
| |
| CProcess *CProcessContainer::GetProcess (const char *name, bool checkstate) |
| { |
| const char method_name[] = "CProcessContainer::GetProcess (name)"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| nameMap_t::iterator it; |
| CProcess *entry = NULL; |
| |
| if ( ! strlen( name ) ) |
| { |
| TRACE_EXIT; |
| return entry; |
| } |
| char pname[MAX_PROCESS_NAME]; |
| strncpy(pname, name, MAX_PROCESS_NAME); |
| pname[MAX_PROCESS_NAME-1] = '\0'; |
| |
| NormalizeName (pname); |
| |
| // Look up name in process-name-to-process-object map. |
| nameMapLock_.lock(); |
| it = nameMap_->find( pname ); |
| |
| if (it != nameMap_->end()) |
| { |
| entry = it->second; |
| |
| // bugcatcher, temp call |
| entry->validateObj(); |
| |
| if (trace_settings & TRACE_PROCESS_DETAIL) |
| trace_printf("%s@%d - Name=%s, checkstate=%d, state=%d, backup=%d\n", |
| method_name, __LINE__, entry->GetName(), checkstate, |
| entry->GetState(), entry->IsBackup()); |
| |
| if ( checkstate && entry->GetState() != State_Up) |
| { // Only return entry if it has completed startup |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf( "%s@%d - Process %s (%d,%d:%d) not in 'Up' state" |
| ", checkstate=%d, state=%d, backup=%d\n" |
| , method_name, __LINE__ |
| , entry->GetName() |
| , entry->GetNid() |
| , entry->GetPid() |
| , entry->GetVerifier() |
| , checkstate |
| , entry->GetState() |
| , entry->IsBackup()); |
| entry = NULL; |
| } |
| } |
| nameMapLock_.unlock(); |
| |
| TRACE_EXIT; |
| |
| return entry; |
| } |
| |
| CProcess *CProcessContainer::GetProcess( int pid |
| , Verifier_t verifier |
| , bool checkstate ) |
| { |
| const char method_name[] = "CProcessContainer::GetProcess(pid, verifier)"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| CProcess *entry = NULL; |
| |
| if ( pid != -1 ) |
| { |
| entry = CProcessContainer::GetProcess( pid ); |
| } |
| |
| if ( entry ) |
| { |
| if ( (verifier != -1) && (verifier != entry->GetVerifier()) ) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) |
| { |
| trace_printf( "%s@%d - Get (%d:%d) failed -- verifier mismatch (%d)\n" |
| , method_name, __LINE__ |
| , pid |
| , verifier |
| , entry->GetVerifier() ); |
| } |
| entry = NULL; |
| } |
| } |
| |
| if ( entry && checkstate && entry->GetState() != State_Up) |
| { // Only return entry if it has completed startup |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf( "%s@%d - Process %s (%d,%d:%d) not in 'Up' state" |
| ", checkstate=%d, state=%d, backup=%d\n" |
| , method_name, __LINE__ |
| , entry->GetName() |
| , entry->GetNid() |
| , entry->GetPid() |
| , entry->GetVerifier() |
| , checkstate |
| , entry->GetState() |
| , entry->IsBackup()); |
| entry = NULL; |
| } |
| |
| TRACE_EXIT; |
| return entry; |
| } |
| |
| CProcess *CProcessContainer::GetProcess( const char *name |
| , Verifier_t verifier |
| , bool checkstate ) |
| { |
| const char method_name[] = "CProcessContainer::GetProcess(name, verifier)"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| CProcess *entry = NULL; |
| |
| if ( strlen( name ) ) |
| { |
| entry = CProcessContainer::GetProcess( name, checkstate ); |
| } |
| |
| if ( entry ) |
| { |
| if ( (verifier != -1) && (verifier != entry->GetVerifier()) ) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) |
| { |
| trace_printf( "%s@%d - Get (%s:%d) failed -- verifier mismatch (%d)\n" |
| , method_name, __LINE__ |
| , name |
| , verifier |
| , entry->GetVerifier() ); |
| } |
| entry = NULL; |
| } |
| } |
| |
| TRACE_EXIT; |
| return entry; |
| } |
| |
| CProcess *CProcessContainer::GetProcessByType (PROCESSTYPE type) |
| { |
| CProcess *entry = head_; |
| |
| const char method_name[] = "CProcessContainer::GetProcessByType"; |
| TRACE_ENTRY; |
| |
| if ( ! nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CNode (the physical node) |
| abort(); |
| } |
| |
| entry = entry->GetProcessByType( type ); |
| TRACE_EXIT; |
| |
| return entry; |
| } |
| |
| // see: CLNode::GetProcessL (int pid) |
| // see: CLNode::GetProcessL (char *name, bool checkstate) |
| |
| CProcess *CProcessContainer::GetProcessLByType(PROCESSTYPE type) |
| { |
| CProcess *entry = head_; |
| |
| const char method_name[] = "CProcessContainer::GetProcessByType"; |
| TRACE_ENTRY; |
| |
| if ( nodeContainer_ ) |
| { |
| // Programmer bonehead :^) |
| // This must only be called from CLNode (the logical node) |
| abort(); |
| } |
| |
| entry = entry->GetProcessLByType( type ); |
| |
| TRACE_EXIT; |
| |
| return entry; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::KillAll( STATE node_State, CProcess *requester ) |
| { |
| CProcess *process = NULL; |
| int nid; |
| |
| const char method_name[] = "CProcessContainer::KillAll"; |
| TRACE_ENTRY; |
| |
| nameMapLock_.lock(); |
| |
| nameMap_t::iterator nameMapIt; |
| nameMap_t::iterator nameMapItSave; |
| for ( nameMapIt = nameMap_->begin(); nameMapIt != nameMap_->end(); ) |
| { |
| process = nameMapIt->second; |
| assert( process ); |
| nameMapItSave = nameMapIt; |
| ++nameMapIt; |
| |
| nid = process->GetNid(); |
| if (( process->GetType() != ProcessType_Watchdog ) && |
| ( process != requester ) ) |
| { |
| if (node_State == State_Down) |
| { |
| int killedNid = process->GetNid(); |
| int killedPid = process->GetPid(); |
| bool killedIsClone = process->IsClone(); |
| |
| // Delete name map entry |
| nameMap_->erase(nameMapItSave); |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n", |
| method_name, __LINE__, nameMap_, |
| process->GetName(), killedNid, |
| killedPid); |
| } |
| |
| // Delete pid map entry |
| DelFromPidMap ( process ); |
| |
| // Set process to "stopped" state. SetProcessState |
| // will invoke Exit_Process so "process" is not |
| // valid after SetProcessState returns. |
| SetProcessState( process, State_Stopped, true, -1); |
| if ( nid == killedNid ) |
| { |
| if ( !killedIsClone && killedPid != -1) |
| { |
| kill (killedPid, SIGKILL); |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Completed kill for (%d, %d)\n", method_name, __LINE__, killedNid, killedPid); |
| } |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_REQUEST_DETAIL | TRACE_RECOVERY | TRACE_SYNC_DETAIL | TRACE_TMSYNC | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d change process (%d, %d) state to down\n", method_name, __LINE__, process->GetNid(), process->GetPid()); |
| process->SetState (State_Down); |
| // Replicate the kill to other nodes |
| CReplKill *repl = new CReplKill( process->GetNid() |
| , process->GetPid() |
| , process->GetVerifier() |
| , process->GetAbort()); |
| Replicator.addItem(repl); |
| } |
| } |
| } |
| |
| nameMapLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::KillAllDown() |
| { |
| CProcess *process = NULL; |
| int nid = -1; |
| int pid = -1; |
| |
| const char method_name[] = "CProcessContainer::KillAllDown"; |
| TRACE_ENTRY; |
| |
| nameMap_t::iterator nameMapIt; |
| |
| while ( true ) |
| { |
| nameMapLock_.lock(); |
| nameMapIt = nameMap_->begin(); |
| |
| if (nameMap_->size() == 0) |
| { |
| nameMapLock_.unlock(); |
| break; // all done |
| } |
| |
| process = nameMapIt->second; |
| |
| // Delete name map entry |
| nameMap_->erase (nameMapIt); |
| |
| nameMapLock_.unlock(); |
| |
| nid = process->GetNid(); |
| pid = process->GetPid(); |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n", |
| method_name, __LINE__, nameMap_, |
| process->GetName(), nid, pid); |
| } |
| |
| // Delete pid map entry |
| DelFromPidMap ( process ); |
| |
| // valid for virtual cluster only. |
| if ( !process->IsClone() && pid != -1 ) |
| { |
| // killing the process will not remove the process object because |
| // exit processing will get queued until this completes. |
| kill( pid, SIGKILL ); |
| PROCESSTYPE type = process->GetType(); |
| if ( type == ProcessType_TSE || |
| type == ProcessType_ASE ) |
| { |
| // unmount volume would acquire nameMapLock_ internally. |
| Devices->UnMountVolume( process->GetName(), process->IsBackup() ); |
| } |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Completed kill for %s (%d, %d)\n", method_name, __LINE__, process->GetName(), nid, pid); |
| } |
| |
| // Remove all processes |
| // PSD will re-create persistent processes on spare node activation |
| Exit_Process( process, true, nid ); |
| } |
| |
| // clean up clone processes on this node that do not have entries in |
| // nameMap_ or pidMap_ yet and restart persistent processes |
| CProcess *nextProc = NULL; |
| process = head_; |
| |
| while (process) |
| { |
| nextProc = process->GetNext(); |
| |
| // Delete pid map entry |
| DelFromPidMap ( process ); |
| |
| Exit_Process( process, true, nid ); |
| |
| process = nextProc; |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::KillAllDownSoft() |
| { |
| const char method_name[] = "CProcessContainer::KillAllDownSoft"; |
| TRACE_ENTRY; |
| |
| CProcess *process = NULL; |
| int nid = -1; |
| int pid = -1; |
| PROCESSTYPE type; |
| nameMap_t::iterator nameMapIt; |
| |
| while ( true ) |
| { |
| nameMapLock_.lock(); |
| nameMapIt = nameMap_->begin(); |
| |
| if (nameMap_->size() == 0) |
| { |
| nameMapLock_.unlock(); |
| break; // all done |
| } |
| |
| process = nameMapIt->second; |
| |
| // Delete name map entry |
| nameMap_->erase (nameMapIt); |
| |
| nameMapLock_.unlock(); |
| |
| nid = process->GetNid(); |
| pid = process->GetPid(); |
| type = process->GetType(); |
| |
| if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf("%s@%d removed from nameMap %p: %s (%d, %d)\n", |
| method_name, __LINE__, nameMap_, |
| process->GetName(), nid, pid); |
| } |
| |
| // valid for virtual cluster or soft node down only. |
| if ( type != ProcessType_DTM && type != ProcessType_NameServer ) |
| { |
| // Delete pid map entry |
| DelFromPidMap ( process ); |
| |
| // valid for virtual cluster only. |
| if ( !process->IsClone() && pid != -1 ) |
| { |
| // killing the process will not remove the process object because |
| // exit processing will get queued until this completes. |
| kill( pid, SIGKILL ); |
| PROCESSTYPE type = process->GetType(); |
| if ( type == ProcessType_TSE || |
| type == ProcessType_ASE ) |
| { |
| // unmount volume would acquire nameMapLock_ internally. |
| Devices->UnMountVolume( process->GetName(), process->IsBackup() ); |
| } |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Completed kill for %s (%d, %d)\n", method_name, __LINE__, process->GetName(), nid, pid); |
| } |
| // Remove all processes |
| // PSD will re-create persistent processes on spare node activation |
| Exit_Process( process, true, nid ); |
| } |
| } |
| |
| // clean up clone processes on this node that do not have entries in |
| // nameMap_ or pidMap_ yet and restart persistent processes |
| CProcess *nextProc = NULL; |
| process = head_; |
| |
| while (process) |
| { |
| nextProc = process->GetNext(); |
| |
| PROCESSTYPE type = process->GetType(); |
| if ( type != ProcessType_DTM && type != ProcessType_NameServer ) |
| { |
| // Delete pid map entry |
| DelFromPidMap ( process ); |
| |
| Exit_Process( process, true, nid ); |
| } |
| |
| process = nextProc; |
| } |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| char *CProcessContainer::NormalizeName (char *name) |
| { |
| char *ptr; |
| |
| const char method_name[] = "CProcessContainer::NormalizeName"; |
| TRACE_ENTRY; |
| ptr = name; |
| while (*ptr) |
| { |
| *ptr = toupper (*ptr); |
| ptr++; |
| } |
| TRACE_EXIT; |
| |
| return name; |
| } |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcessContainer::Open_Process (int nid, int pid, Verifier_t verifier, int death_notification, CProcess * process) |
| { |
| bool status = FAILURE; |
| CProcess *opener_process = NULL; |
| char la_buf[MON_STRING_BUF_SIZE]; |
| |
| const char method_name[] = "CProcessContainer::Open_Process"; |
| TRACE_ENTRY; |
| if (process) |
| { |
| opener_process = Nodes->GetLNode (nid)->GetProcessL(pid); |
| if (opener_process) |
| { |
| if ( (verifier != -1) && (verifier != process->GetVerifier()) ) |
| { |
| if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) |
| { |
| trace_printf("%s@%d - Opener (%d, %d:%d) not found -- verifier mismatch (%d)\n", |
| method_name, __LINE__, |
| nid, |
| pid, |
| verifier, |
| opener_process->GetVerifier()); |
| } |
| } |
| else |
| { |
| status = opener_process->Open (process,death_notification); |
| } |
| } |
| else |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Failed, Can't find opener process, Pid=%d.\n", |
| method_name, pid); |
| mon_log_write(MON_PROCESSCONT_OPENPROCESS_1, SQ_LOG_ERR, la_buf); |
| } |
| } |
| else |
| { |
| snprintf(la_buf, sizeof(la_buf), |
| "[%s], Failed, Can't find process.\n", method_name); |
| mon_log_write(MON_PROCESSCONT_OPENPROCESS_2, SQ_LOG_ERR, la_buf); |
| } |
| TRACE_EXIT; |
| |
| return status; |
| } |
| #endif |
| |
| #ifdef NAMESERVER_PROCESS |
| bool CProcessContainer::RestartPersistentProcess( CProcess *, int ) |
| { |
| return false; |
| } |
| #else |
| // |
| // Persistent process re-creation logic: |
| // |
| // o Process object is target of re-create |
| // o Process object type determines persist configuration template |
| // o Persist configuration template determines re-creation rules |
| // |
| // Re-creation rules: |
| // |
| // PROCESS_NAME format defines (%nid+/%nid) node re-creation scope |
| // $<prefix>%nid+ or $<prefix>%nid or $<name> |
| // |
| // PERSIST_ZONES format defines (%zid+/%zid) rules of re-creation within scope |
| // |
| // (%nid+) Nid_ALL = one process in each node |
| // Zid_ALL = n/a |
| // Zid_RELATIVE = recreate only in initial <nid> assigned |
| // (%nid ) Nid_RELATIVE = one process in cluster |
| // Zid_ALL = recreate in current up <nid> or next up <nid> |
| // Zid_RELATIVE = recreate only in initial <nid> assigned (non-HA) |
| // ( ) Nid_Undefined = one process in cluster |
| // Zid_ALL = recreate in current up <nid> or next up <nid> |
| // Zid_RELATIVE = recreate only in initial <nid> assigned (non-HA) |
| // |
| bool CProcessContainer::RestartPersistentProcess( CProcess *process, int downNid ) |
| { |
| const char method_name[] = "CProcessContainer::RestartPersistentProcess"; |
| TRACE_ENTRY; |
| |
| bool successful = false; |
| bool restart = false; |
| int nid = -1; |
| int max_retries = 3; |
| int retry_max_time = 1; |
| CNode *currenNode; |
| CNode *newNode; |
| CLNode *currentLNode; |
| CLNode *newLNode; |
| CProcess *parent = NULL; |
| CClusterConfig *clusterConfig = Nodes->GetClusterConfig(); |
| CPersistConfig *persistConfig = NULL; |
| |
| assert(clusterConfig != NULL); |
| |
| persistConfig = clusterConfig->GetPersistConfig( process->GetType() |
| , process->GetName() |
| , process->GetNid() ); |
| if (persistConfig) |
| { |
| max_retries = persistConfig->GetPersistRetries(); |
| retry_max_time = persistConfig->GetPersistWindow(); |
| } |
| else |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Persistent process %s not " |
| "restarted because the persist configuration is " |
| "missing.\n" |
| , method_name |
| , process->GetName() ); |
| mon_log_write(MON_PROCESS_PERSIST_2, SQ_LOG_ERR, buf); |
| return false; |
| } |
| |
| // if 1st time retrying to restart process |
| if (process->GetPersistentCreateTime() == 0) |
| { |
| process->SetFirstInstance(false); |
| process->SetPersistentCreateTime ( time(NULL) ); |
| } |
| |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - Persistent process retries = %d, " |
| "time limit = %d, down nid=%d\n" |
| , method_name, __LINE__ |
| , max_retries, retry_max_time, downNid); |
| |
| // get the parent process if any |
| if (process->GetParentNid() != -1 && process->GetParentPid() != -1) |
| { |
| parent = Nodes->GetLNode( process->GetParentNid())->GetProcessL(process->GetParentPid() ); |
| } |
| |
| currentLNode = Nodes->GetLNode( process->GetNid() ); |
| newLNode = Nodes->GetLNodeNext( process->GetNid() ); |
| |
| switch (persistConfig->GetProcessNameNidFormat()) |
| { |
| case Nid_ALL: // one process in each <nid> |
| switch (persistConfig->GetZoneZidFormat()) |
| { |
| case Zid_ALL: // n/a |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], Persistent process %s not " |
| "restarted because the persist configuration is " |
| "inconsistent for key %s.\n" |
| , method_name |
| , process->GetName() |
| , persistConfig->GetPersistPrefix() ); |
| mon_log_write(MON_PROCESS_PERSIST_2, SQ_LOG_ERR, buf); |
| return false; |
| case Zid_RELATIVE: // recreate only in initial <nid> assigned |
| default: |
| // Is this a node down and node going down is process' node? |
| if ( downNid != -1 && currentLNode->GetNid() == downNid ) |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - original node is not available, nid=%d, downNid=%d\n" |
| , method_name, __LINE__ |
| , currentLNode->GetNid() |
| , downNid ); |
| } |
| else |
| { |
| if ( currentLNode->GetState() == State_Up) |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d - original node is available, nid=%d\n" |
| , method_name, __LINE__, process->GetNid()); |
| } |
| |
| if ( MyNode->IsMyNode(process->GetNid()) ) |
| { |
| restart = true; |
| } |
| } |
| else |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - original node is not available, nid=%d, downNid=%d\n" |
| , method_name, __LINE__ |
| , currentLNode->GetNid() |
| , downNid ); |
| } |
| } |
| } // switch |
| break; |
| case Nid_RELATIVE: // one process in cluster |
| case Nid_Undefined: // one process in cluster |
| default: |
| switch (persistConfig->GetZoneZidFormat()) |
| { |
| case Zid_ALL: // recreate in current up <nid> or next up <nid> |
| // check if we need to do something because the node is down and |
| // spare node is not activating |
| if ((downNid != -1 && !currentLNode->GetNode()->IsSpareNode()) || |
| currentLNode->GetState() == State_Down ) |
| { |
| nid = (newLNode) ? newLNode->GetNid() : -1; |
| if ( newLNode && |
| (newLNode->GetState() == State_Up && |
| newLNode->GetNid() != downNid ) ) |
| { |
| if (MyNode->IsMyNode(nid)) |
| { |
| // OK we need to move the process to our node |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - Moving process from nid=%d to new nid=%d\n" |
| , method_name, __LINE__ |
| , process->GetNid(), nid); |
| currenNode = currentLNode->GetNode(); |
| currenNode->RemoveFromList(process); |
| process->SetNid ( nid ); |
| process->SetPid ( -1 ); |
| newNode = newLNode->GetNode(); |
| newNode->AddToList( process ); |
| process->SetClone( false ); |
| // Replicate the clone to other nodes |
| CReplClone *repl = new CReplClone(process); |
| Replicator.addItem(repl); |
| restart = true; |
| } |
| else |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - Not moving process from nid=%d to nid=%d""\n" |
| , method_name, __LINE__ |
| , process->GetNid(), nid); |
| } |
| } |
| else |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - Next possible node is not available, nid=%d\n" |
| , method_name, __LINE__, nid); |
| } |
| } |
| else |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - original node is available, nid=%d\n" |
| , method_name, __LINE__, process->GetNid()); |
| |
| if ( MyNode->IsMyNode(process->GetNid()) ) |
| { |
| restart = true; |
| } |
| } |
| break; |
| case Zid_RELATIVE: // recreate only in initial <nid> assigned (non-HA) |
| default: |
| // Is this a node down and node going down is process' node? |
| if ( downNid != -1 && currentLNode->GetNid() == downNid ) |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - original node is not available, nid=%d, downNid=%d\n" |
| , method_name, __LINE__ |
| , currentLNode->GetNid() |
| , downNid ); |
| } |
| else |
| { |
| if ( currentLNode->GetState() == State_Up) |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| { |
| trace_printf( "%s@%d - original node is available, nid=%d\n" |
| , method_name, __LINE__, process->GetNid()); |
| } |
| |
| if ( MyNode->IsMyNode(process->GetNid()) ) |
| { |
| restart = true; |
| } |
| } |
| else |
| { |
| if (trace_settings & |
| (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf( "%s@%d - original node is not available, nid=%d, downNid=%d\n" |
| , method_name, __LINE__ |
| , currentLNode->GetNid() |
| , downNid ); |
| } |
| } |
| } |
| break; |
| } |
| |
| if ( Nodes->IsShutdownActive() ) |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - Shutdown process " "%s" " on nid=" "%d" "\n", method_name, __LINE__, process->GetName(), process->GetNid()); |
| successful = false; |
| } |
| else |
| { |
| // Re-initialize process flags |
| process->SetState (State_Unknown); |
| |
| if (( restart ) && |
| ( MyNode->IsMyNode(process->GetNid()) )) |
| { |
| // check if we should retry to create the process |
| if ( (time(NULL) - process->GetPersistentCreateTime()) < retry_max_time ) |
| { |
| int retryCount = process->GetPersistentRetries(); |
| if ( retryCount < max_retries ) |
| { |
| ++retryCount; |
| process->SetPersistentRetries ( retryCount ); |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d - Max retries exceeded for " |
| "process %s, retry count=%d, max " |
| "retries=%d\n", method_name, __LINE__, |
| process->GetName(), retryCount, |
| max_retries); |
| |
| char buf[MON_STRING_BUF_SIZE]; |
| |
| snprintf(buf, sizeof(buf), "[%s], Persistent process %s " |
| "not restarted because the maximum retry count " |
| "(%d) has been exceeded.\n", |
| method_name, process->GetName(), retryCount); |
| mon_log_write(MON_PROCESS_PERSIST_1, SQ_LOG_INFO, buf); |
| |
| if ( process->GetType() == ProcessType_DTM || |
| process->GetType() == ProcessType_PSD || |
| process->GetType() == ProcessType_TMID || |
| process->GetType() == ProcessType_Watchdog || |
| process->GetType() == ProcessType_SMS ) |
| { |
| if ( process->GetType() == ProcessType_DTM ) |
| { |
| MyNode->SetDTMAborted( true ); |
| } |
| if ( process->GetType() == ProcessType_SMS ) |
| { |
| MyNode->SetSMSAborted( true ); |
| } |
| |
| snprintf(buf, sizeof(buf), "[%s], Critial persistent process %s " |
| "not restarted, " |
| "scheduling node down on node %s (%d)!\n", |
| method_name, process->GetName(), MyNode->GetName(), MyPNID); |
| mon_log_write(MON_PROCESS_PERSIST_4, SQ_LOG_CRIT, buf); |
| |
| ReqQueue.enqueueDownReq(MyPNID); |
| } |
| |
| return false; |
| } |
| } |
| else |
| { |
| process->SetPersistentRetries ( 0 ); |
| if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) |
| trace_printf("%s@%d" " - Retries count reset for process " "%s" "\n", method_name, __LINE__, process->GetName()); |
| } |
| |
| if ( process->GetType() == ProcessType_DTM ) |
| { |
| // Kill all local processes |
| Monitor->SoftNodeDown( MyPNID ); |
| } |
| |
| // OK ... just restart the process on the same node |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - Restarting process " "%s" " on nid=" "%d" "\n", method_name, __LINE__, process->GetName(), process->GetNid()); |
| process->SetDeletePending ( false ); |
| process->SetStartupCompleted ( false ); |
| process->SetPriorPid( !MyNode->IsSpareNode() ? process->GetPid() : 0 ); |
| process->SetClone( false ); |
| int result; |
| successful = process->Create(parent, 0, result); |
| if (successful) |
| { |
| process->SetAbended( false ); |
| Nodes->GetLNode (process->GetNid())->GetNode() |
| ->AddToNameMap(process); |
| Nodes->GetLNode (process->GetNid())->GetNode() |
| ->AddToPidMap(process->GetPid(), process); |
| process->SetPersistentCreateTime ( time(NULL) ); |
| if ( process->GetType() == ProcessType_SSMP ) |
| { |
| Nodes->GetLNode ( process->GetNid() )->SetSSMProc ( process ); |
| } |
| } |
| else |
| { |
| if ( process->GetType() == ProcessType_DTM ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf( buf, sizeof(buf) |
| , "[%s], DTM (%s) persistent restart failed, Node %s going down\n" |
| , method_name, process->GetName(), MyNode->GetName()); |
| mon_log_write(MON_PROCESS_PERSIST_6, SQ_LOG_INFO, buf); |
| |
| snprintf( buf, sizeof(buf), |
| "DTM (%s) persistent restart failed, Node %s going down\n", |
| process->GetName(), MyNode->GetName()); |
| genSnmpTrap( buf ); |
| |
| // DTM just died unexpectedly, so bring the node down |
| Monitor->HardNodeDown(MyPNID, true); |
| } |
| } |
| } |
| else |
| { |
| if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) |
| trace_printf("%s@%d" " - Not restarting process " "%s" " on nid=" "%d" "\n", method_name, __LINE__, process->GetName(), process->GetNid()); |
| successful = restart; |
| } |
| } |
| |
| TRACE_EXIT; |
| |
| return successful; |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::PidHangupSet ( int pid ) |
| { |
| hungupPidsLock_.lock(); |
| hungupPids_.insert ( pid ); |
| hungupPidsLock_.unlock(); |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::PidHangupClear ( int pid ) |
| { |
| hungupPidsLock_.lock(); |
| hungupPids_.erase ( pid ); |
| hungupPidsLock_.unlock(); |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::CheckFdState ( int fd ) |
| { |
| const char method_name[ ] = "CProcessContainer::CheckFdState"; |
| char buf[MON_STRING_BUF_SIZE]; |
| |
| int epollfd = epoll_create(5); |
| if (epollfd == -1) |
| { |
| snprintf(buf, sizeof(buf), "[%s], epoll_create error, %s (%d)\n", |
| method_name, strerror(errno), errno); |
| mon_log_write(MON_PROCESS_CHECKFDSTATE_1, SQ_LOG_ERR, buf); |
| |
| return; |
| } |
| |
| // Add file descriptor to epoll set |
| struct epoll_event ev; |
| memset(&ev, 0, sizeof(ev)); |
| ev.events = EPOLLIN; |
| ev.data.fd = fd; |
| |
| if ((epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) |
| && (errno != EEXIST)) |
| { |
| snprintf(buf, sizeof(buf), |
| "[%s], epoll_ctl error, adding fd=%d, %s (%d)\n", |
| method_name, fd, strerror(errno), errno); |
| mon_log_write(MON_PROCESS_CHECKFDSTATE_2, SQ_LOG_ERR, buf); |
| |
| return; |
| } |
| |
| // see if hangup is still asserted on stderr |
| struct epoll_event event_list[10]; |
| int ready_fds = epoll_wait (epollfd, event_list, 10, 0); |
| |
| if (ready_fds == -1) |
| { // epoll_wait error |
| snprintf(buf, sizeof(buf), "[%s], epoll_wait error, %s (%d)\n", |
| method_name, strerror(errno), errno); |
| mon_log_write(MON_PROCESS_CHECKFDSTATE_3, SQ_LOG_ERR, buf); |
| } |
| else if (ready_fds != 0) |
| { |
| for (int n=0; n < ready_fds; n++) |
| { |
| snprintf(buf, sizeof(buf), |
| "[%s], for fd=%d, events=%d\n", method_name, |
| event_list[n].data.fd, event_list[n].events); |
| mon_log_write(MON_PROCESS_CHECKFDSTATE_4, SQ_LOG_INFO, buf); |
| } |
| } |
| else |
| { // Indicate the epoll hangup no longer asserted |
| snprintf(buf, sizeof(buf), |
| "[%s], No events pending for fd=%d\n", method_name, fd); |
| mon_log_write(MON_PROCESS_CHECKFDSTATE_5, SQ_LOG_INFO, buf); |
| } |
| |
| close( epollfd ); |
| } |
| #endif |
| |
| #ifndef NAMESERVER_PROCESS |
| void CProcessContainer::PidHangupCheck ( time_t now ) |
| { |
| const char method_name[ ] = "CProcessContainer::PidHangupCheck"; |
| TRACE_ENTRY; |
| char buf[MON_STRING_BUF_SIZE]; |
| |
| // Examine the list of processes for which we have received a |
| // pipe hangup indication but have not received a child death |
| // signal. |
| hungupPidsLock_.lock(); |
| int pid; |
| for (hungupPids_t::const_iterator it = hungupPids_.begin(); |
| it != hungupPids_.end();) |
| { |
| pid = *it; |
| ++it; |
| |
| if (trace_settings & TRACE_PROCESS) |
| { |
| trace_printf("%s@%d process %d is in hangup list\n", |
| method_name, __LINE__, pid); |
| } |
| |
| CProcess * process = GetProcess (pid); |
| time_t hangupTime = 0; |
| |
| if (process) |
| { |
| hangupTime = process->GetHangupTime(); |
| if ( now < (hangupTime + PROCESS_DEATH_MARGIN) ) |
| { // Process hangup detected recently. Wait a while before |
| // taking action on this process. This allows time for |
| // child death signal to arrive. |
| |
| // temp trace |
| if (trace_settings & TRACE_PROCESS) |
| { |
| trace_printf("%s@%d process %d not yet ripe\n", |
| method_name, __LINE__, pid); |
| } |
| |
| continue; |
| } |
| } |
| |
| // See if process is still alive |
| if (kill(pid,0) == -1) |
| { |
| if (errno == ESRCH) |
| { // Process no longer exists |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf("%s@%d process %d no longer exists\n", |
| method_name, __LINE__, pid); |
| // Log info |
| snprintf(buf, sizeof(buf), |
| "[%s], process %d no longer exists, initiating " |
| "exit processing\n", method_name, pid); |
| mon_log_write(MON_PROCESS_PIDHANGUPCHECK_1, SQ_LOG_INFO, buf); |
| |
| // Remove from set |
| hungupPids_.erase ( pid ); |
| |
| // set state process |
| // Queue request for processing by worker thread |
| ReqQueue.enqueueChildDeathReq ( pid ); |
| |
| // release buffers |
| // todo |
| } |
| else |
| { |
| int saveerrno = errno; |
| |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf("%s@%d process %d, errno=%d (%p)\n", |
| method_name, __LINE__, pid, saveerrno, |
| strerror(saveerrno)); |
| |
| // Log info |
| snprintf(buf, sizeof(buf), |
| "[%s], error getting process %d info, %s (%d)\n", |
| method_name, pid, strerror(saveerrno), saveerrno); |
| mon_log_write(MON_PROCESS_PIDHANGUPCHECK_2, SQ_LOG_INFO, buf); |
| } |
| } |
| else |
| { |
| char timestring[50]; |
| if (process) |
| { |
| strcpy(timestring, ctime ( &hangupTime )); |
| timestring[strlen(timestring)-1] = '\0'; |
| } |
| else |
| { |
| strcpy(timestring, "unknown"); |
| } |
| |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf("%s@%d process %d (%s) still running, no child " |
| "death indication received (hangup at %s)\n", |
| method_name, __LINE__, pid, |
| ((process != NULL) ? process->GetName(): "unknown"), |
| timestring); |
| |
| // Log info |
| snprintf(buf, sizeof(buf), |
| "[%s], process %d (%s) still running, no child death " |
| "indication received (hangup at %s)\n", method_name, pid, |
| ((process != NULL) ? process->GetName() : "unknown"), |
| timestring); |
| mon_log_write(MON_PROCESS_PIDHANGUPCHECK_3, SQ_LOG_INFO, buf); |
| |
| |
| if (process) |
| CheckFdState( process->FdStderr() ); |
| |
| // Possibly kill process after sufficient time has elapsed |
| // todo |
| } |
| } |
| hungupPidsLock_.unlock(); |
| |
| TRACE_EXIT; |
| } |
| #endif |
| |
| void CProcessContainer::SetProcessState( CProcess *process, STATE state, bool abend, int downNode ) |
| { |
| const char method_name[] = "CProcessContainer::SetProcessState(process)"; |
| TRACE_ENTRY; |
| |
| if ( process ) |
| { |
| switch ( state ) |
| { |
| case State_Down: |
| // Process intends to exits, when the child death arrives the |
| // State_Stopped is processed |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf( "%s@%d Setting State_Down for process %s(%d,%d:%d), abend=%d, down=%d\n" |
| , method_name, __LINE__ |
| , process->GetName() |
| , process->GetNid() |
| , process->GetPid() |
| , process->GetVerifier() |
| , abend, downNode ); |
| process->SetState( State_Down ); |
| if ( abend && !process->IsAbended() ) |
| { |
| process->SetAbended( abend ); |
| } |
| break; |
| |
| case State_Stopped: |
| if ( process->GetState() != State_Stopped ) |
| { |
| // Process terminated so handle the exit processing. |
| // Termination detected through a child death signal or |
| // a broken stderr pipe for an attached process. |
| |
| // Note: Exit_Process() will delete the process object, so |
| // save the process information needed before the call |
| #ifndef NAMESERVER_PROCESS |
| PROCESSTYPE processType = process->GetType(); |
| #endif |
| string processName = process->GetName(); |
| int processNid = process->GetNid(); |
| int processPid = process->GetPid(); |
| Verifier_t processVerifier = process->GetVerifier(); |
| #ifndef NAMESERVER_PROCESS |
| Exit_Process( process, abend, downNode ); |
| #endif |
| if (trace_settings & TRACE_PROCESS) |
| trace_printf( "%s@%d Set State_Stopped for process %s(%d,%d:%d), abend=%d, down=%d, " |
| "killingMyNode=%d,DTM aborted=%d, SMS aborted=%d\n" |
| , method_name, __LINE__ |
| , processName.c_str(), processNid, processPid, processVerifier |
| , abend, downNode |
| , MyNode->IsKillingNode(), MyNode->IsDTMAborted(), MyNode->IsSMSAborted()); |
| #ifndef NAMESERVER_PROCESS |
| if ( !MyNode->IsKillingNode() ) |
| { |
| switch ( processType ) |
| { |
| case ProcessType_DTM: |
| if ( MyNode->GetState() != State_Shutdown && |
| MyNode->IsDTMAborted() ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), |
| "[%s], DTM (%s) aborted, Node %s going down\n", |
| method_name, processName.c_str(), MyNode->GetName()); |
| mon_log_write(MON_PROCESS_SETSTATE_1, SQ_LOG_INFO, buf); |
| |
| snprintf( buf, sizeof(buf), |
| "DTM (%s) aborted, Node %s going down\n", |
| processName.c_str(), MyNode->GetName()); |
| genSnmpTrap( buf ); |
| |
| // DTM just died unexpectedly, so bring the node down |
| Monitor->HardNodeDown(MyPNID, true); |
| } |
| break; |
| case ProcessType_SMS: |
| if ( MyNode->GetState() != State_Shutdown && |
| MyNode->IsSMSAborted() ) |
| { |
| char buf[MON_STRING_BUF_SIZE]; |
| snprintf(buf, sizeof(buf), |
| "[%s], SMS (%s) aborted, Node %s going down\n", |
| method_name, processName.c_str(), MyNode->GetName()); |
| mon_log_write(MON_PROCESS_SETSTATE_2, SQ_LOG_INFO, buf); |
| |
| snprintf( buf, sizeof(buf), |
| "SMS (%s) aborted, Node %s going down\n", |
| processName.c_str(), MyNode->GetName()); |
| genSnmpTrap( buf ); |
| |
| // SMS just died unexpectedly, so bring the node down |
| Monitor->HardNodeDown(MyPNID, true); |
| } |
| break; |
| default: // no special handling |
| break; |
| } |
| } |
| #endif |
| } |
| break; |
| default: |
| process->SetState( state ); |
| break; |
| } |
| } |
| |
| TRACE_EXIT; |
| } |
| |
| |
| |
| #ifndef NAMESERVER_PROCESS |
| bool CProcessContainer::WhoEnlisted( _TM_Txid_External trans_id, struct message_def *msg ) |
| { |
| int idx; |
| CProcess *process = head_; |
| CNotice *notice; |
| |
| const char method_name[] = "CProcessContainer::WhoEnlisted"; |
| TRACE_ENTRY; |
| while ((process) && |
| (msg->u.reply.u.trans_info.num_processes < MAX_PROC_LIST )) |
| { |
| notice = process->GetNoticeHead(); |
| while (notice) |
| { |
| if ( isEqual( notice->TransID, trans_id ) ) |
| { |
| idx = msg->u.reply.u.trans_info.num_processes; |
| msg->u.reply.u.trans_info.procs[idx].nid = process->GetNid(); |
| msg->u.reply.u.trans_info.procs[idx].pid = process->GetPid(); |
| msg->u.reply.u.trans_info.procs[idx].trans_id = trans_id; |
| msg->u.reply.u.trans_info.num_processes++; |
| if (msg->u.reply.u.trans_info.num_processes >= MAX_PROC_LIST) |
| { |
| msg->u.reply.u.trans_info.return_code = MPI_ERR_TRUNCATE; |
| return FAILURE; |
| } |
| break; |
| } |
| notice = notice->GetNext(); |
| } |
| process = process->GetNext(); |
| } |
| |
| TRACE_EXIT; |
| return SUCCESS; |
| } |
| #endif |