blob: f2762c048e0a6ab22dac040de3b08cb486587766 [file] [log] [blame]
///////////////////////////////////////////////////////////////////////////////
//
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
//
///////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include "reqqueue.h"
#include "montrace.h"
#include "monsonar.h"
#include "monlogging.h"
#include "replicate.h"
#include "mlio.h"
extern CMonitor *Monitor;
extern CMonStats *MonStats;
extern CNodeContainer *Nodes;
extern CReplicate Replicator;
extern CNode *MyNode;
extern int MyPNID;
CExtKillReq::CExtKillReq (reqQueueMsg_t msgType, int pid,
struct message_def *msg )
: CExternalReq(msgType, pid, msg)
{
// Add eyecatcher sequence as a debugging aid
memcpy(&eyecatcher_, "RQEF", 4);
}
CExtKillReq::~CExtKillReq()
{
// Alter eyecatcher sequence as a debugging aid to identify deleted object
memcpy(&eyecatcher_, "rqef", 4);
}
void CExtKillReq::populateRequestString( void )
{
char strBuf[MON_STRING_BUF_SIZE/2] = { 0 };
snprintf( strBuf, sizeof(strBuf),
"ExtReq(%s) req #=%ld "
"requester(name=%s/nid=%d/pid=%d/os_pid=%d/verifier=%d) "
"target(name=%s/nid=%d/pid=%d/verifier=%d)"
, CReqQueue::svcReqType[reqType_], getId()
, msg_->u.request.u.kill.process_name
, msg_->u.request.u.kill.nid
, msg_->u.request.u.kill.pid
, pid_
, msg_->u.request.u.kill.verifier
, msg_->u.request.u.kill.target_process_name
, msg_->u.request.u.kill.target_nid
, msg_->u.request.u.kill.target_pid
, msg_->u.request.u.kill.target_verifier );
requestString_.assign( strBuf );
}
void CExtKillReq::Kill( CProcess *process )
{
CNode *node = NULL;
CLNode *lnode = NULL;
const char method_name[] = "CMonitor::Kill";
TRACE_ENTRY;
process->SetAbended ( true );
process->SetState( State_Down );
lnode = Nodes->GetLNode( process->GetNid() );
node = lnode->GetNode();
if ( (node->GetState() == State_Up ||
node->GetState() == State_Shutdown) && node->GetPNid() != MyPNID )
{
// Replicate the kill to other nodes
CReplKill *repl = new CReplKill( process->GetNid()
, process->GetPid()
, process->GetVerifier()
, process->GetAbort());
Replicator.addItem(repl);
}
else
{
// if the node is not up, then the process was or will be killed
if ( !process->IsClone() &&
(node->GetState() == State_Up ||
node->GetState() == State_Shutdown) )
{
kill (process->GetPid(), Monitor->GetProcTermSig());
// Save the pid/verifier to cleanup LIO buffers after SIGCHLD
SQ_theLocalIOToClient->addToVerifierMap( process->GetPid()
, process->GetVerifier() );
if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Completed kill for process %s (%d, %d:%d)\n",
method_name, __LINE__, process->GetName(),
process->GetNid(), process->GetPid(), process->GetVerifier());
CProcess *parent = Nodes->GetProcess( process->GetParentNid(),
process->GetParentPid() );
process->Switch(parent); // switch process pair roles if needed
}
}
TRACE_EXIT;
}
void CExtKillReq::performRequest()
{
bool status = FAILURE;
CProcess *process = NULL;
CProcess *backup = NULL;
const char method_name[] = "CExtKillReq::performRequest";
TRACE_ENTRY;
// Record statistics (sonar counters)
if (sonar_verify_state(SONAR_ENABLED | SONAR_MONITOR_ENABLED))
MonStats->req_type_kill_Incr();
// Trace info about request
if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
{
trace_printf( "%s@%d request #%ld: Kill, requester %s (%d, %d:%d), "
"target %s (%d, %d:%d), persistent "
"abort=%d\n", method_name, __LINE__, id_,
msg_->u.request.u.kill.process_name,
msg_->u.request.u.kill.nid,
msg_->u.request.u.kill.pid,
msg_->u.request.u.kill.verifier,
msg_->u.request.u.kill.target_process_name,
msg_->u.request.u.kill.target_nid,
msg_->u.request.u.kill.target_pid,
msg_->u.request.u.kill.target_verifier,
msg_->u.request.u.kill.persistent_abort);
}
nid_ = msg_->u.request.u.kill.nid;
verifier_ = msg_->u.request.u.kill.verifier;
processName_ = msg_->u.request.u.kill.process_name;
int target_nid = -1;
int target_pid = -1;
string target_process_name;
Verifier_t target_verifier = -1;
CProcess *requester = NULL;
target_nid = msg_->u.request.u.kill.target_nid;
target_pid = msg_->u.request.u.kill.target_pid;
target_process_name = (const char *) msg_->u.request.u.kill.target_process_name;
target_verifier = msg_->u.request.u.kill.target_verifier;
if ( processName_.size() )
{ // find by name
requester = MyNode->GetProcess( processName_.c_str()
, verifier_ );
}
else
{ // find by pid
requester = MyNode->GetProcess( pid_
, verifier_ );
}
if ( requester )
{
if ( target_process_name.size() )
{ // find by name (check node state, don't check process state, not backup)
process = Nodes->GetProcess( target_process_name.c_str()
, target_verifier
, true, false, false );
if ( process &&
(msg_->u.request.u.kill.target_nid == -1 ||
msg_->u.request.u.kill.target_pid == -1))
{
backup = process->GetBackup ();
}
}
else
{ // find by nid (check node state, don't check process state, backup is Ok)
process = Nodes->GetProcess( target_nid
, target_pid
, target_verifier
, true, false, true );
backup = NULL;
}
if (process)
{
process->SetAbort( msg_->u.request.u.kill.persistent_abort );
if (backup)
{
// We are killing both the primary and backup processes
Kill( backup );
}
Kill( process );
msg_->u.reply.type = ReplyType_Generic;
msg_->u.reply.u.generic.nid = process->GetNid();
msg_->u.reply.u.generic.pid = process->GetPid();
msg_->u.reply.u.generic.verifier = process->GetVerifier();
strcpy (msg_->u.reply.u.generic.process_name, process->GetName());
msg_->u.reply.u.generic.return_code = MPI_SUCCESS;
status = SUCCESS;
}
else
{
if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
{
trace_printf( "%s@%d - Kill %s (%d, %d:%d) -- can't find target process\n"
, method_name, __LINE__
, msg_->u.request.u.kill.target_process_name
, msg_->u.request.u.kill.target_nid
, msg_->u.request.u.kill.target_pid
, msg_->u.request.u.kill.target_verifier);
}
}
}
else
{
if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - Can't find requester, rc=%d\n", method_name, __LINE__, MPI_ERR_NAME);
}
if (status == FAILURE)
{
msg_->u.reply.type = ReplyType_Generic;
msg_->u.reply.u.generic.nid = -1;
msg_->u.reply.u.generic.pid = -1;
msg_->u.reply.u.generic.verifier = -1;
msg_->u.reply.u.generic.process_name[0] = '\0';
msg_->u.reply.u.generic.return_code = MPI_ERR_NAME;
if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
trace_printf("%s@%d - unsuccessful\n", method_name, __LINE__);
}
// Send reply to requester
lioreply(msg_, pid_);
TRACE_EXIT;
}