blob: 1438becfd941b633feae18f0cea9722e538e8790 [file] [log] [blame]
///////////////////////////////////////////////////////////////////////////////
//
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
//
///////////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <signal.h>
#include <string.h>
#include <iostream>
#include <unistd.h>
using namespace std;
#include "clio.h"
#include "monlogging.h"
#include "montrace.h"
#include "msgdef.h"
#include "lock.h"
#include "pkillall.h"
#include "procmon.h"
#include "watchdog.h"
#include "sdtimer.h"
#include "gentrap.h"
// NOTE(review): LUNMGR_RETRY_MAX is not referenced by any code visible in
// this file -- confirm whether it is still needed.
#define LUNMGR_RETRY_MAX 3
// Default softdog keep-alive interval. The CSdTimer constructor uses it as the
// initial value of sdtKeepAliveTimerValue_ and only overrides it when the
// SQ_WDT_KEEPALIVETIMERVALUE environment variable is set.
#define SDT_KeepAliveTimerDefault 5 // in seconds
// Defined elsewhere. CSdTimer::SoftdogTimer() calls SetNodeDown() on it and
// wakes it when the node is being taken down.
extern CWatchdog *Watchdog;
// Defined elsewhere. CSdTimer::NodeFailSafe() calls SetCheckMonitor( false )
// on it before killing the monitor process.
extern CProcessMonitor *ProcessMonitor;
static void *SoftdogThread( void *arg )
{
const char method_name[] = "SoftdogThread";
TRACE_ENTRY;
// Parameter passed to the thread is the CSdTimer object
CSdTimer *sdTimer = (CSdTimer *) arg;
if ( trace_settings & TRACE_INIT )
{
trace_printf( "%s@%d Thread started\n", method_name, __LINE__ );
}
// Mask all allowed signals except SIGPROF
sigset_t mask;
sigfillset( &mask);
sigdelset( &mask, SIGPROF ); // allows profiling such as google profiler
int rc = pthread_sigmask( SIG_SETMASK, &mask, NULL );
if ( rc != 0 )
{
char buf[MON_STRING_BUF_SIZE];
sprintf( buf, "[%s], pthread_sigmask error=%d\n", method_name, rc );
monproc_log_write( MON_SDTIMER_SOFTDOG_TH_1, SQ_LOG_ERR, buf );
}
sdTimer->SoftdogTimer();
if ( trace_settings & TRACE_INIT )
{
trace_printf( "%s@%d EXIT thread\n", method_name, __LINE__ );
}
TRACE_EXIT;
pthread_exit( (void *)errno );
return( (void *)errno );
}
// Constructs a disabled softdog timer.
//
// Environment variables read:
//   SQ_WDT_KEEPALIVETIMERVALUE - keep-alive interval in seconds (parsed with
//                                atoi); when unset the default
//                                SDT_KeepAliveTimerDefault is kept.
//   SQ_WDT_DUMP_MONITOR        - when exactly "1", enables dumping of the
//                                monitor process (dumpMonitor_).
//
// expiredTime_ is initialised to the current CLOCK_REALTIME time plus the
// keep-alive interval.
CSdTimer::CSdTimer()
         :CLock()
         ,state_(SDT_DISABLED)
         ,dumpMonitor_(false)
         ,killingNode_(false)
         ,softdog_(false)
         ,sdtKeepAliveTimerValue_(SDT_KeepAliveTimerDefault)
         ,threadId_(0)
         ,sdtLastMonRefreshCtr_(0)
{
    const char method_name[] = "CSdTimer::CSdTimer";
    TRACE_ENTRY;

    // Eyecatcher tag: debugging aid for recognising a live object in memory
    memcpy( &eyecatcher_, "SDTM", 4 );

    // Optional override of the keep-alive interval (seconds)
    const char *keepAliveEnv = getenv( "SQ_WDT_KEEPALIVETIMERVALUE" );
    if ( keepAliveEnv != NULL )
    {
        sdtKeepAliveTimerValue_ = atoi( keepAliveEnv );
    }

    clock_gettime( CLOCK_REALTIME, &expiredTime_ );
    expiredTime_.tv_sec += sdtKeepAliveTimerValue_;

    // Optional request to dump the monitor process; only "1" enables it
    const char *dumpMonitorEnv = getenv( "SQ_WDT_DUMP_MONITOR" );
    const bool tracingInit = ( trace_settings & TRACE_INIT ) != 0;

    if ( tracingInit )
    {
        trace_printf( "%s@%d - KeepAlive Timer in seconds =%ld\n"
                    , method_name, __LINE__, sdtKeepAliveTimerValue_ );
        trace_printf( "%s@%d - Start time %ld(secs):%ld(nsecs)\n"
                    , method_name, __LINE__, expiredTime_.tv_sec, expiredTime_.tv_nsec );
    }

    if ( dumpMonitorEnv != NULL && strcmp( dumpMonitorEnv, "1" ) == 0 )
    {
        dumpMonitor_ = true;
    }

    if ( tracingInit )
    {
        trace_printf( "%s@%d - Dump monitor process, dumpMonitor_=%d\n"
                    , method_name, __LINE__, dumpMonitor_ );
    }

    TRACE_EXIT;
}
// Destructor. Only rewrites the eyecatcher so that a destroyed object can be
// told apart from a live one when inspecting memory.
CSdTimer::~CSdTimer( void )
{
    const char method_name[] = "CSdTimer::~CSdTimer";
    TRACE_ENTRY;

    // Lower-case tag marks the object as deleted (live objects carry "SDTM")
    static const char deletedTag[4] = { 's', 'd', 't', 'm' };
    memcpy( &eyecatcher_, deletedTag, sizeof(deletedTag) );

    TRACE_EXIT;
}
// Generates a core file of the monitor process by running the 'gcore' utility
// against the monitor's pid, but only when IsDumpMonitor() is true.
//
// stdin is closed while the command runs and restored afterwards.
//
// returns - 0 when dumping is disabled; otherwise the value returned by
//           CUtility::ExecuteCommand() (-1 is logged as a failure to execute
//           the command).
int CSdTimer::DumpMonitorProcess( void )
{
    const char method_name[] = "CSdTimer::DumpMonitorProcess";
    TRACE_ENTRY;

    int rc = 0;

    if ( IsDumpMonitor() )
    {
        CUtility gCore( "gcore" );

        if (trace_settings & TRACE_REQUEST)
        {
            trace_printf("%s@%d" " - Generating monitor core!\n", method_name, __LINE__);
        }

        // Large enough for any int rendered in decimal (sign, 10 digits, NUL).
        // A 5-byte buffer overflows as soon as the pid has five or more digits.
        char pidstr[16];
        snprintf( pidstr, sizeof(pidstr), "%d", gp_local_mon_io->get_monitor_pid() );

        char la_buf[MON_STRING_BUF_SIZE];
        snprintf( la_buf, sizeof(la_buf), "Generating monitor core %s\n", pidstr );
        monproc_log_write( MON_SDTIMER_STOPPROCESSES_1, SQ_LOG_ERR, la_buf);

        // Save and close stdin for the duration of the command
        int savedStdIn = dup(STDIN_FILENO);
        close(STDIN_FILENO);

        // Run 'gcore <monitor pid>'
        rc = gCore.ExecuteCommand( pidstr );
        if ( rc == -1 )
        {
            snprintf( la_buf, sizeof(la_buf)
                    , "[%s], Error= Can't execute 'gcore' command!\n", method_name );
            monproc_log_write( MON_SDTIMER_DUMPMONITORPROC_1, SQ_LOG_ERR, la_buf);
        }

        // Restore stdin on both the success and the failure path; skip the
        // restore when dup() failed, since there is nothing to restore from.
        if ( savedStdIn != -1 )
        {
            dup2(savedStdIn, STDIN_FILENO);
            close(savedStdIn);
        }
    }

    TRACE_EXIT;
    return( rc );
}
// Determines whether the monitor process is currently being traced (e.g. by
// a debugger) by reading the "TracerPid" entry of /proc/<monitor pid>/status.
//
// returns - true when a "TracerPid" line with a non-zero value is found;
//           false otherwise, including when the status file cannot be opened
//           (that failure is logged).
bool CSdTimer::IsMonitorInDebug( void )
{
    const char method_name[] = "CSdTimer::IsMonitorInDebug";
    TRACE_ENTRY;

    bool inDebug = false;
    char buffer[132];
    char filepath[MAX_PROCESS_PATH];

    snprintf( filepath, sizeof(filepath), "/proc/%d/status"
            , gp_local_mon_io->get_monitor_pid() );

    // "/proc/%d/status" file pointer
    FILE *procMonitorStatusFile = fopen( filepath, "r" );
    if ( !procMonitorStatusFile )
    {
        int err = errno; // capture before any other library call can change it
        char buf[MON_STRING_BUF_SIZE];
        snprintf( buf, sizeof(buf)
                , "[%s], Cannot monitor process status open %s, %s (%d)\n"
                , method_name, filepath, strerror(err), err );
        monproc_log_write(MON_SDTIMER_MONITORINDEBUG_1, SQ_LOG_ERR, buf);

        TRACE_EXIT;
        return( false );
    }

    const char *procGdbPidString = "TracerPid";
    const size_t procGdbPidStringLen = strlen( procGdbPidString );
    int totalProcStatusFound = 0;

    // Examine each /proc/%d/status item for "TracerPid". The loop is driven by
    // the fgets() result so that a failed read (EOF or error) never causes the
    // previous line to be processed a second time.
    while ( fgets( buffer, sizeof(buffer), procMonitorStatusFile ) != NULL )
    {
        if ( strncmp( buffer, procGdbPidString, procGdbPidStringLen ) == 0 )
        {
            // Skip the ':' that follows the item name, then parse the pid
            if ( atoi( &buffer[procGdbPidStringLen+1] ) != 0 )
            {
                inDebug = true;
            }
        }

        // Stop once the expected number of status items has been read
        ++totalProcStatusFound;
        if ( totalProcStatusFound == procFinalItem )
        {
            break;
        }
    }

    fclose( procMonitorStatusFile );

    TRACE_EXIT;
    return( inDebug );
}
// Brings the node down: logs the reason, then (if the softdog is enabled)
// optionally suspends the monitor, kills all processes and finally kills the
// monitor process itself.
//
// Nothing is done beyond logging when the SQ_VIRTUAL_NODES environment
// variable is set.
//
// timerExpired - when true the monitor process is suspended first.
// shutdown     - selects the "node shutdown" log message instead of the
//                "node fail safe" message.
void CSdTimer::NodeFailSafe( bool timerExpired, bool shutdown )
{
    const char method_name[] = "CSdTimer::NodeFailSafe";
    TRACE_ENTRY;

    char la_buf[MON_STRING_BUF_SIZE];

    if( getenv("SQ_VIRTUAL_NODES") )
    {
        snprintf( la_buf, sizeof(la_buf)
                , "[%s], Node fail safe is not supported in a virtual cluster!\n", method_name );
        monproc_log_write( MON_SDTIMER_NODEFAILSAFE_1, SQ_LOG_INFO, la_buf);

        // Keep entry/exit tracing balanced on the early-return path as well
        TRACE_EXIT;
        return;
    }

    if ( shutdown )
    {
        snprintf( la_buf, sizeof(la_buf)
                , "[%s], Node shutdown triggered - Node shutting down! \n", method_name );
        monproc_log_write( MON_SDTIMER_NODEFAILSAFE_2, SQ_LOG_CRIT, la_buf);
    }
    else
    {
        // expiredTime_ holds "last refresh + keep-alive interval", so the
        // interval is subtracted to report the time of the last refresh.
        snprintf( la_buf, sizeof(la_buf)
                , "[%s], Node fail safe triggered - Node going down! Last refreshed at %ld(secs) %ld(nsecs). \n"
                , method_name, expiredTime_.tv_sec - sdtKeepAliveTimerValue_, expiredTime_.tv_nsec );
        monproc_log_write( MON_SDTIMER_NODEFAILSAFE_3, SQ_LOG_CRIT, la_buf);
    }

    int rc = 0;

    if ( IsSoftdogEnabled() )
    {
        if (trace_settings & TRACE_REQUEST)
        {
            trace_printf("%s@%d" " - Node going down!\n", method_name, __LINE__);
        }

        if ( timerExpired )
        {
            SuspendMonitorProcess();
        }

        SetKillingNode( true );
        rc = StopProcesses();
        if ( rc == 0 )
        {
            // Stop monitoring the monitor process
            ProcessMonitor->SetCheckMonitor( false );
            StopMonitorProcess();
        }
    }

    if (trace_settings & TRACE_REQUEST)
    {
        trace_printf("%s@%d" " - Node down processing complete! (rc=%d)\n", method_name, __LINE__, rc);
    }

    TRACE_EXIT;
}
// Re-arms the softdog timer when it is enabled.
//
// timeout - out: receives the new expiration time (current CLOCK_REALTIME
//           time plus the keep-alive interval). Left untouched when the
//           softdog is disabled.
void CSdTimer::ResetSoftdogTimer( struct timespec &timeout )
{
    const char method_name[] = "CSdTimer::ResetSoftdogTimer";
    TRACE_ENTRY;

    const bool enabled = IsSoftdogEnabled();
    if ( enabled )
    {
        // New deadline = now + keep-alive interval
        clock_gettime( CLOCK_REALTIME, &expiredTime_ );
        expiredTime_.tv_sec = expiredTime_.tv_sec + sdtKeepAliveTimerValue_;

        // Hand the deadline back to the caller and return to the armed state
        timeout = expiredTime_;
        SetState( SDT_FAIL );
    }

    TRACE_EXIT;
}
// Checks whether the monitor's refresh counter has changed since the previous
// call.
//
// returns - true when the counter differs from the saved value (the saved
//           value is then updated); false when it is unchanged.
bool CSdTimer::CheckMonitorRefresh()
{
    const char method_name[] = "CSdTimer::CheckMonitorRefresh";
    TRACE_ENTRY;

    const int monRefreshCtr = gp_local_mon_io->getLastMonRefresh();

    // The counter is expected never to go backwards
    assert(monRefreshCtr >= sdtLastMonRefreshCtr_);

    const bool refreshed = ( monRefreshCtr != sdtLastMonRefreshCtr_ );
    if ( refreshed )
    {
        // Remember the new value for the next comparison
        sdtLastMonRefreshCtr_ = monRefreshCtr;
    }

    TRACE_EXIT;
    return refreshed;
}
// Computes the next wake-up time.
//
// ts - out: set to the current CLOCK_REALTIME time plus the keep-alive
//      interval (seconds).
void CSdTimer::SetTimeToWakeUp( struct timespec &ts )
{
    const char method_name[] = "CSdTimer::SetTimeToWakeUp";
    TRACE_ENTRY;

    clock_gettime( CLOCK_REALTIME, &ts );
    ts.tv_sec = ts.tv_sec + sdtKeepAliveTimerValue_;

    TRACE_EXIT;
}
// Main loop of the softdog worker thread (called from SoftdogThread).
//
// Each iteration takes this object's lock, waits -- indefinitely while the
// softdog is disabled, or until 'timeout' while it is enabled -- and then
// dispatches on the current state. The loop ends once the state is SDT_EXIT.
//
// NOTE(review): CLock::wait()/timedWait() presumably release the lock while
// blocked (condition-variable semantics) -- confirm against CLock.
void CSdTimer::SoftdogTimer( void )
{
const char method_name[] = "CSdTimer::SoftdogTimer";
TRACE_ENTRY;
int rc;
struct timespec timeout;
// Set to true the first time a timed wait returns ETIMEDOUT.
// NOTE(review): it is never reset to false in this function, so after one
// timeout every later pass through SDT_FAIL/SDT_EXPIRE/SDT_SHUTDOWN sees
// timerExpired == true even if that wake-up was not a timeout -- confirm
// this is intended.
bool timerExpired = false;
// Initial deadline: now + keep-alive interval
SetTimeToWakeUp( timeout );
// until there is an exit event from the monitor or timer expires
while ( GetState() != SDT_EXIT )
{
lock();
if ( !IsSoftdogEnabled() )
{
// Wait until timer started
CLock::wait();
}
else
{
// Wait until signaled or timer expires
rc = CLock::timedWait( &timeout );
if ( rc == ETIMEDOUT )
{
timerExpired = true;
if (trace_settings & TRACE_REQUEST)
{
trace_printf("%s@%d" " - Softdog Timer refresh expired" "\n", method_name, __LINE__);
}
// A timeout forces the SDT_FAIL handling below unless an exit was requested
if ( GetState() != SDT_FAIL && GetState() != SDT_EXIT )
{
SetState( SDT_FAIL );
}
}
else
{
if (trace_settings & TRACE_REQUEST)
{
trace_printf("%s@%d" " - Softdog Timer not expired, state_=%d\n", method_name, __LINE__, GetState());
}
}
}
// Act on the state that is current after the wait
switch ( GetState() )
{
case SDT_START:
// Enable the softdog and compute a fresh deadline
StartSoftdogTimer();
SetTimeToWakeUp( timeout );
break;
case SDT_FAIL:
// Nothing is done in this state unless a timeout has been recorded
if ( timerExpired )
{
// Re-arm when the monitor's refresh counter changed, or when the
// monitor process is being traced (IsMonitorInDebug)
if ( CheckMonitorRefresh() || IsMonitorInDebug() )
{
ResetSoftdogTimer( timeout );
}
else
{
// No refresh seen: dump the monitor (if enabled), pass a message to
// genSnmpTrap(), take the node down and notify the Watchdog object
DumpMonitorProcess();
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf), "Node %d going down, "
"failed to get refresh event from monitor\n",
gv_ms_su_nid);
genSnmpTrap( buf );
NodeFailSafe( timerExpired );
StopSoftdogTimer();
Watchdog->SetNodeDown();
Watchdog->CLock::wakeOne();
}
}
break;
case SDT_EXPIRE:
// Take the node down without checking for a monitor refresh
NodeFailSafe( timerExpired );
StopSoftdogTimer();
Watchdog->SetNodeDown();
Watchdog->CLock::wakeOne();
break;
case SDT_SHUTDOWN:
// Same sequence as SDT_EXPIRE, but logged as a node shutdown
NodeFailSafe( timerExpired, true /*shutdown*/ );
StopSoftdogTimer();
Watchdog->SetNodeDown();
Watchdog->CLock::wakeOne();
break;
case SDT_RESET:
// Push the deadline out by one keep-alive interval
ResetSoftdogTimer( timeout );
break;
case SDT_STOP:
// Disable the softdog; the next iteration waits without a deadline
StopSoftdogTimer();
break;
default:
break;
}
unlock();
}
TRACE_EXIT;
}
// Enables the softdog timer if it is not already enabled: records a new
// expiration time (now + keep-alive interval), marks the softdog as enabled
// and sets the state to SDT_FAIL. Does nothing when already enabled.
void CSdTimer::StartSoftdogTimer( void )
{
    const char method_name[] = "CSdTimer::StartSoftdogTimer";
    TRACE_ENTRY;

    const bool alreadyRunning = IsSoftdogEnabled();
    if ( !alreadyRunning )
    {
        if ( trace_settings & TRACE_REQUEST )
        {
            trace_printf( "%s@%d Timer started!\n", method_name, __LINE__ );
        }

        // First deadline = now + keep-alive interval
        clock_gettime( CLOCK_REALTIME, &expiredTime_ );
        expiredTime_.tv_sec = expiredTime_.tv_sec + sdtKeepAliveTimerValue_;

        SetSoftdog( true );
        SetState( SDT_FAIL );
    }

    TRACE_EXIT;
}
// Disables the softdog timer if it is enabled: clears the softdog flag and
// sets the state to SDT_DISABLED. Does nothing when already disabled.
void CSdTimer::StopSoftdogTimer( void )
{
    const char method_name[] = "CSdTimer::StopSoftdogTimer";
    TRACE_ENTRY;

    const bool running = IsSoftdogEnabled();
    if ( running )
    {
        if ( trace_settings & TRACE_REQUEST )
        {
            trace_printf( "%s@%d Timer stopped!\n", method_name, __LINE__ );
        }

        SetSoftdog( false );
        SetState( SDT_DISABLED );
    }

    TRACE_EXIT;
}
int CSdTimer::StartWorker( void )
{
const char method_name[] = "CSdTimer::StartWorker";
TRACE_ENTRY;
int rc = pthread_create( &threadId_, NULL, SoftdogThread, this );
if (rc != 0)
{
char la_buf[MON_STRING_BUF_SIZE];
int err = rc;
sprintf(la_buf, "[%s], Error= Can't create thread! - errno=%d (%s)\n", method_name, err, strerror(err));
monproc_log_write( MON_SDTIMER_STARTWORKER_1, SQ_LOG_ERR, la_buf);
TRACE_EXIT;
return( rc );
}
if (trace_settings & TRACE_INIT)
{
trace_printf("%s@%d" " SoftdogThread created, threadId=%lx" "\n", method_name, __LINE__, threadId_);
}
TRACE_EXIT;
return( rc );
}
int CSdTimer::StopProcesses( void )
{
const char method_name[] = "CSdTimer::StopProcesses";
TRACE_ENTRY;
CPKillAll pKillAll( "pkillall" );
if (trace_settings & TRACE_REQUEST)
{
trace_printf("%s@%d" " - Killing all processes!\n", method_name, __LINE__);
}
// save, close and restore stdin
int savedStdIn = dup(STDIN_FILENO);
close(STDIN_FILENO);
// kill all processes
int rc = pKillAll.ExecuteCommand( "-safekill" );
if ( rc == -1 )
{
char la_buf[MON_STRING_BUF_SIZE];
sprintf(la_buf, "[%s], Error= Can't execute 'pkillall -safekill' command!\n", method_name);
monproc_log_write( MON_SDTIMER_STOPPROCESSES_1, SQ_LOG_ERR, la_buf);
dup2(savedStdIn, STDIN_FILENO);
close(savedStdIn);
TRACE_EXIT;
return( rc );
}
dup2(savedStdIn, STDIN_FILENO);
close(savedStdIn);
TRACE_EXIT;
return( rc );
}
// Kills the monitor process with SIGKILL.
//
// returns - 0 when the signal was sent or the process no longer exists
//           (ESRCH); -1 when kill() failed for any other reason, in which
//           case the errno value is logged.
int CSdTimer::StopMonitorProcess( void )
{
    const char method_name[] = "CSdTimer::StopMonitorProcess";
    TRACE_ENTRY;

    int monPid = gp_local_mon_io->get_monitor_pid();

    int rc = kill( monPid, SIGKILL );
    if ( rc == -1 )
    {
        int err = errno; // capture before any other call can change it
        if ( err == ESRCH )
        {
            rc = 0; // It's already dead
        }
        else
        {
            // Any failure other than "no such process" is a real error; report
            // the errno value rather than kill()'s -1 return code.
            char buf[MON_STRING_BUF_SIZE];
            snprintf( buf, sizeof(buf)
                    , "[%s], Error= Can't kill monitor process! - errno=%d (%s)\n"
                    , method_name, err, strerror(err) );
            monproc_log_write(MON_SDTIMER_STOPMONITORPROC_1, SQ_LOG_ERR, buf);
        }
    }

    TRACE_EXIT;
    return( rc );
}
// Sends SIGUSR2 to the monitor process to suspend it.
//
// returns - 0 when the signal was sent or the process no longer exists
//           (ESRCH); -1 when kill() failed for any other reason, in which
//           case the errno value is logged.
int CSdTimer::SuspendMonitorProcess( void )
{
    const char method_name[] = "CSdTimer::SuspendMonitorProcess";
    TRACE_ENTRY;

    int monPid = gp_local_mon_io->get_monitor_pid();

    if (trace_settings & TRACE_REQUEST)
    {
        trace_printf("%s@%d" " - Suspending monitor process, pid=%d\n", method_name, __LINE__, monPid);
    }

    int rc = kill( monPid, SIGUSR2 );
    if ( rc == -1 )
    {
        int err = errno; // capture before any other call can change it
        if ( err == ESRCH )
        {
            rc = 0; // It's dead
        }
        else
        {
            // Any failure other than "no such process" is a real error; report
            // the errno value rather than kill()'s -1 return code.
            char buf[MON_STRING_BUF_SIZE];
            snprintf( buf, sizeof(buf)
                    , "[%s], Error= Can't signal monitor process! - errno=%d (%s)\n"
                    , method_name, err, strerror(err) );
            monproc_log_write(MON_SDTIMER_SUSPENDMONITORPROC_1, SQ_LOG_ERR, buf);
        }
    }

    TRACE_EXIT;
    return( rc );
}
int CSdTimer::ShutdownWork( void )
{
const char method_name[] = "CSdTimer::ShutdownWork";
TRACE_ENTRY;
int rc;
while ( IsSoftdogEnabled() )
{
StopSoftdogTimer();
char buf[MON_STRING_BUF_SIZE];
sprintf(buf, "[%s], Watchdog process timer stopped!\n", method_name);
monproc_log_write(MON_SDTIMER_SHUTDOWNWORK_1, SQ_LOG_INFO, buf);
}
// Wake up Softdog thread to exit.
SetState( SDT_EXIT );
CLock::wakeOne();
if (trace_settings & TRACE_INIT)
trace_printf( "%s@%d waiting for Softdog check thread=%lx to exit.\n",
method_name, __LINE__, threadId_ );
// Wait for Softdog thread to exit
if ((rc = pthread_join(threadId_, NULL)) != 0)
{
char buf[MON_STRING_BUF_SIZE];
int err = rc;
sprintf(buf, "[%s], Error= Can't join thread! - errno=%d (%s)\n", method_name, err, strerror(err));
monproc_log_write(MON_SDTIMER_SHUTDOWNWORK_2, SQ_LOG_ERR, buf);
}
TRACE_EXIT;
return( rc );
}