#!/bin/bash
#
# @@@ START COPYRIGHT @@@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# @@@ END COPYRIGHT @@@
#
# SQ startup driver script. Run sqgen before this script to generate the files required for startup.
# Append a timestamped line to the startup log.
# Globals:   TRAF_HOME (read) - the log file is $TRAF_HOME/logs/startup.log
# Arguments: $1 - message text
function LogIt {
  # The redirection target is quoted so that a TRAF_HOME containing
  # whitespace does not produce an "ambiguous redirect" error.
  printf '%s: %s\n' "$(date)" "$1" >> "$TRAF_HOME/logs/startup.log"
}
# Print the command-line help text and terminate the script with status 1.
# Globals: script_name (read) - shown in the usage line
function Usage {
  printf '%s\n' \
    "" \
    "Usage: $script_name { -c } -x" \
    "" \
    "-c Perform a Cold Start" \
    "-x Perform Extra Checks (chk_dbperms) before allowing startup" \
    ""
  exit 1
}
# Print a duration as " H hour(s) M minute(s) S second(s)".
# Arguments: $1 - duration in seconds (optional)
# Globals:   elapsed_seconds (read) - used when $1 is not supplied
# Outputs:   the formatted duration on stdout
function hms {
  # Honour the argument the caller passes; fall back to the global so that
  # a bare "hms" keeps working. A missing value is treated as 0.
  local total=${1:-${elapsed_seconds:-0}}
  local hours minutes seconds
  hours=$(( total / 3600 ))
  minutes=$(( (total % 3600) / 60 ))
  seconds=$(( total % 60 ))
  echo " $hours hour(s) $minutes minute(s) $seconds second(s)"
}
# Parse the option letters given as arguments.
#   -c  sets sq_start to "c"
#   -x  sets run_chk_dbperms to 1
#   -h or any unrecognised option calls Usage
function GetOpts {
  while getopts "chx" arg; do
    if [[ $arg == c ]]; then
      sq_start=c
    elif [[ $arg == x ]]; then
      run_chk_dbperms=1
    else
      Usage
    fi
  done
}
# Abort the script when a step reported a non-zero status.
# Arguments: $1 - name of the step, used in the message
#            $2 - status reported by that step
# When $2 is not 0 the failure is logged, a message is printed and the
# script exits with $2; otherwise the function simply returns.
function chkReturnCode {
  [[ $2 == 0 ]] && return 0
  LogIt "End $script_name $sq_start, exit code: $2"
  echo "$1 returned error $2, Exiting..."
  exit $2
}
# Print a message on stdout and, when SQMON_LOG is set, append it to that file.
# Arguments: $1 - message text (may be omitted to emit a blank line)
# Globals:   SQMON_LOG (read) - log file path; may be unset/empty
function echoLog {
  # The message is quoted so it is written verbatim: no whitespace
  # collapsing and no pathname expansion of characters such as '*'.
  if [[ -n "$SQMON_LOG" ]]; then
    printf '%s\n' "$1" | tee -a "$SQMON_LOG"
  else
    # No log file known yet: behave like a plain echo instead of handing
    # tee an empty file name.
    printf '%s\n' "$1"
  fi
}
# Force a cold start: delete this node's monitor registry files under
# MPI_TMPDIR and select the cold-start script.
# Globals: MPI_TMPDIR, COLD_STARTFILE (read); monreg, SQSCRIPT_FILE (written)
function gomonColdStart {
  local node_name
  node_name=$(uname -n)
  monreg=${MPI_TMPDIR}/monitor.registry.*.${node_name}
  echoLog "Forcing a Cold Start: rm $monreg"
  # Left unquoted on purpose: the '*' in the pattern has to expand here.
  rm -f $monreg 2>&1
  echoLog "Performing a Cold Start"
  SQSCRIPT_FILE=${COLD_STARTFILE}
}
# Select the cold-start script without removing any registry files.
# Globals: COLD_STARTFILE (read), SQSCRIPT_FILE (written)
function gomonColdRecoveryStart {
  SQSCRIPT_FILE="${COLD_STARTFILE}"
}
# Verify that 'monitor' in the current directory resolves to an existing
# file. If it does not, report whether the link is dangling or absent,
# log the failure and exit the script with status 1.
function CheckMonitorLink {
  # Nothing to report when the entry exists.
  [ -e monitor ] && return 0

  if [ -h monitor ]; then
    # A symbolic link is present but its target is missing.
    echoLog "The monitor link $TRAF_HOME/sql/scripts/monitor points to a non-existent target:"
    ls -al monitor
  else
    echoLog "A soft link to 'monitor' is missing in $TRAF_HOME/sql/scripts"
  fi
  echoLog "Aborting startup."
  LogIt "End $script_name $sq_start, exit code: 1"
  exit 1
}
# Look up a background job of this shell and publish its pid.
# Arguments: $1 - job identifier handed to 'jobs -l' (exits 1 when empty)
# Globals:   gv_job_pid (written) - second word of the 'jobs -l' output,
#            or empty when the job is not listed
function getJobInfo {
  [ -n "$1" ] || exit 1

  lv_job=$1
  gv_job_pid=""
  lv_jobinfo=$(jobs -l $lv_job)
  if [ -n "$lv_jobinfo" ]; then
    # Split the listing into words; the word at index 1 is taken as the pid.
    local -a lv_words=($lv_jobinfo)
    gv_job_pid=${lv_words[1]}
  fi
}
# Run the selected startup script and record the outcome.
# The script's output goes to SQMON_LOG; "0" or "1" is written to the
# SQSTART_EXIT_STATUS file depending on whether the script succeeded.
# Globals: SQSCRIPT_FILE, SQMON_LOG, SQSTART_EXIT_STATUS (read);
#          sqstart_stat, sq_stat (written)
function SQStartProcesses {
  echoLog "Starting the SQ Environment (Executing $SQSCRIPT_FILE)"
  echoLog
  $SQSCRIPT_FILE > $SQMON_LOG 2>&1
  sqstart_stat=$?
  echo

  # Success path: note it and publish status 0.
  if [[ $sqstart_stat == 0 ]]; then
    echoLog "SQ Startup script ($SQSCRIPT_FILE) ran successfully. Performing further checks..."
    echo "0" > $SQSTART_EXIT_STATUS
    return
  fi

  # Failure path: show the state of the environment, then publish status 1.
  echoLog "Error while executing the startup script!!!"
  sqcheck -f
  sq_stat=$?
  [[ $sq_stat == 0 ]] &&
    echoLog "Even though all the SQ processes started up, the startup did not succeed!"
  echoLog
  echoLog "Please check the SQ shell log file : $SQMON_LOG"
  echoLog
  echoLog "SQ Startup (from $PWD) Failed"
  echoLog
  echo "1" > $SQSTART_EXIT_STATUS
}
# Refuse to start while processes from a previous SQ instance are still
# around. 'cstat -h' is polled; more than two output lines is treated as
# "orphans exist" (the first two lines are presumably headers - confirm
# against cstat). The check is repeated every 5 seconds and gives up once
# more than lv_max_checks checks have found orphans, at which point the
# processes are listed, the failure is logged and the script exits 1.
function SQCheckOrphanProcesses {
  local lv_orphans_exist=0
  local lv_orphan_process_count=0
  local lv_actual_orphan_count=0
  local lv_num_checks=0
  local lv_max_checks=10
  local lv_msg1 lv_msg2

  echo -n "Checking orphan processes"
  while true; do
    echo -n "."
    lv_orphan_process_count=$(cstat -h | wc -l)
    if (( lv_orphan_process_count <= 2 )); then
      lv_orphans_exist=0
      break
    fi
    lv_orphans_exist=1
    lv_actual_orphan_count=$(( lv_orphan_process_count - 2 ))
    echo -ne "\rChecking orphan processes: $lv_actual_orphan_count"
    lv_num_checks=$(( lv_num_checks + 1 ))
    if (( lv_num_checks > lv_max_checks )); then
      break
    fi
    sleep 5
  done
  echo

  if (( lv_orphans_exist == 1 )); then
    lv_msg1="There are orphan processes from a previous SQ instance."
    echo "$lv_msg1"
    cstat
    echo
    lv_msg2="SQ startup has not been initiated. Exiting..."
    # Print the second message itself (previously an unset variable was echoed).
    echo "$lv_msg2"
    LogIt "End $script_name, exit code: 1, $lv_msg1 $lv_msg2"
    exit 1
  fi
}
# Determine whether Hadoop is configured for Kerberos and, if so, whether
# a ticket is available.
# Arguments: $1 - path of the Hadoop core-site.xml to inspect
#                 (optional; default /etc/hadoop/conf/core-site.xml)
# Returns:   0 when the file does not name kerberos on the line following
#            "hadoop.security.authentication", or when 'krb5service status'
#            prints no "Ticket information not found" line; otherwise the
#            number of such lines (non-zero).
function checkKerberos {
  local core_site=${1:-/etc/hadoop/conf/core-site.xml}
  local retcode tktExists

  # Check to see if kerberos is enabled in Hadoop.
  # The loop runs as the last stage of a pipeline, i.e. in a subshell, so
  # the 'exit' calls only end that subshell and hand the answer back as the
  # pipeline status (1 = kerberos, 0 = anything else or property not found).
  cat "$core_site" | while read -r prop_line; do
    if [[ $prop_line == *hadoop.security.authentication* ]]; then
      # The value is expected on the line right after the property name.
      read -r value_line
      if [[ $value_line == *kerberos* ]]; then
        exit 1
      else
        exit 0
      fi
    fi
  done
  retcode=$?

  if [[ $retcode -eq 1 ]]; then
    # Check to see if TGT exists (ticket granting ticket)
    tktExists=$(krb5service status | grep -c "Ticket information not found")
    return "$tktExists"
  fi
  return 0
}
# Report kernel / ulimit settings that the startup considers unsuitable.
# Messages go through echoLog; the function only reports, it never exits.
# Checks, in order (the first finding ends the function):
#   1. /proc/sys/kernel/pid_max greater than 65535           -> ERROR
#   2. 'ulimit -l' below 65536 (and not "unlimited")         -> ERROR
#   3. 'ulimit -l' below 327680 (and not "unlimited")        -> WARNING
function checkUlimit {
  local userProc maxLockMem

  #check max_pid
  userProc=$(cat /proc/sys/kernel/pid_max)
  if [[ "$userProc" -gt 65535 ]]; then
    echoLog ""
    echoLog "ERROR: issue detected during startup. Please make sure your kernel parameter (sysctl) is set up correctly."
    echoLog "ERROR: max_pid should not be greater than 65535"
    echoLog ""
    return
  fi

  #check ulimit -l
  maxLockMem=$(ulimit -l)
  # "unlimited" is compared as a string. A numeric test would evaluate the
  # word as 0 and could not tell it apart from a real limit of 0.
  if [[ "$maxLockMem" == "unlimited" ]]; then
    return 0
  fi
  if (( maxLockMem < 65536 )); then
    echoLog ""
    echoLog "ERROR: issue detected during startup. Please make sure your ulimit -l is set up correctly."
    echoLog "ERROR: max locked memory is smaller than 64M"
    echoLog ""
    return
  fi
  if (( maxLockMem < 327680 )); then
    echoLog ""
    echoLog "WARNING: issue detected during startup, please make sure your ulimit -l is set up correctly."
    echoLog "WARNING: max locked memory is smaller than 320M "
    echoLog ""
    return
  fi
}
#########################################################
# MAIN portion of sqstart begins here
#########################################################
# Name used in the usage text and log messages: last path component of $0.
script_name=${0##*/}
sq_start=$SQ_STARTUP
run_chk_dbperms=0
STARTtime=$(date +%s)

# Hand every command-line argument to the option parser. An unsubscripted
# $BASH_ARGV expands to element 0 only (the last argument), which dropped
# all but one option when several were given.
GetOpts "$@"

if [[ -z "$TRAF_HOME" ]]; then
  echoLog
  echoLog "The TRAF_HOME environment variable does not exist."
  echoLog "Please ensure sqenv.sh has been sourced, and re-run $script_name."
  echoLog
  exit 1
fi

# Later steps use paths relative to this directory, so stop if it cannot
# be entered.
if ! cd "$TRAF_HOME/sql/scripts"; then
  echoLog "Unable to change directory to $TRAF_HOME/sql/scripts. Exiting."
  exit 1
fi
# Report questionable kernel / ulimit settings before doing anything else.
checkUlimit

# When SeaMonster is enabled, run sminfo to verify its kernel module; a
# non-zero status from sminfo aborts the startup.
if [[ $SQ_SEAMONSTER == "1" ]]; then
  echoLog
  echoLog "SeaMonster is enabled, checking SeaMonster Kernel Module installed version..."
  echoLog
  if sminfo_out=$($TRAF_HOME/export/bin${SQ_MBTYPE}/sminfo 2>&1); then
    echoLog "$sminfo_out"
    LogIt "$sminfo_out"
    echoLog
  else
    echoLog "$sminfo_out"
    lv_msg="Correct SeaMonster Kernel Module in not installed; please resolve this issue and re-run $script_name!"
    echo "$lv_msg"
    LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
    exit 1
  fi
fi

# The Kerberos ticket check is only made when sw_env.sh is absent from the
# scripts directory.
if [[ ! -e $TRAF_HOME/sql/scripts/sw_env.sh ]] && ! checkKerberos; then
  echo
  echo "Kerberos is enabled on the system but cannot find a valid TGT (ticket granting ticket) for the Trafodion ID"
  echo "Please run trafodion_secure_install to enable Kerberos for Trafodion"
  echo
  exit -1
fi
# Reject a TRAF_HOME path longer than lv_max_mysqroot_len characters.
declare -i lv_len_mysqroot
declare -i lv_max_mysqroot_len
lv_max_mysqroot_len=78
lv_len_mysqroot=${#TRAF_HOME}
# Numeric comparison: a string comparison would order "8" after "78" and
# "100" before it.
if (( lv_len_mysqroot > lv_max_mysqroot_len )); then
  lv_msg="Length of the string \$TRAF_HOME ($TRAF_HOME) should be less than $lv_max_mysqroot_len. Exiting...";
  echoLog "$lv_msg"
  LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
  exit 1
fi
LogIt "Begin $script_name $sq_start"

# Do not start on top of an environment that sqcheck reports as fully (0)
# or partially (1) up.
sqcheck -i 1 -d 1 > /dev/null 2>&1
sq_stat=$?
case $sq_stat in
  0 | 1)
    if [[ $sq_stat == 0 ]]; then
      lv_msg="SQ environment is already up."
    else
      lv_msg="SQ environment is partially up."
    fi
    echoLog "$lv_msg"
    LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
    exit 1
    ;;
esac

SQCheckOrphanProcesses

# Optional ownership/permission check, requested with -x.
if [[ $run_chk_dbperms == 1 ]]; then
  chk_dbperms=$TRAF_HOME/sql/scripts/chk_dbperms
  if [[ -x $chk_dbperms ]]; then
    echo
    echo "Checking ownership and permissions."
    echo -n "This will take a few minutes..."
    $chk_dbperms --busy
    chk_res=$?
    echo
    if (( chk_res > 0 )); then
      lv_msg="Ownership and/or Permissions issues found.. aborting."
      echoLog "$lv_msg"
      LogIt "End $script_name $sq_start, exit code: $chk_res, $lv_msg"
      exit 1
    fi
    echo
  fi
fi
# Locations of the log directory, the monitor log and the status file that
# the background startup job writes.
SQLOG_DIR=$TRAF_HOME/logs
SQMON_LOG=$SQLOG_DIR/sqmon.log
SQSTART_EXIT_STATUS=./sqstart.exit_status
if [[ -z "$SQSCRIPTS_DIR" ]]; then
  SQSCRIPTS_DIR=$TRAF_HOME/sql/scripts
fi

mkdir -p "$SQLOG_DIR"
if [[ -n "$MPI_TMPDIR" ]]; then
  mkdir -p "$MPI_TMPDIR"
fi

# The tests are quoted so that an unset or empty MPI_TMPDIR is reported
# instead of slipping through (an unquoted empty value made the old
# '[ ! -d ]' / '[ ! -w ]' tests evaluate to false).
if [[ ! -d "$MPI_TMPDIR" ]]; then
  lv_msg="Error: The MPI_TMPDIR: $MPI_TMPDIR does not exist. Exiting."
  echoLog "$lv_msg"
  LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
  exit 1
fi
if [[ ! -w "$MPI_TMPDIR" ]]; then
  lv_msg="Error: The MPI_TMPDIR: $MPI_TMPDIR is not writable by $USER. Exiting."
  echoLog "$lv_msg"
  LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
  exit 1
fi
COLD_STARTFILE=$SQSCRIPTS_DIR/gomon.cold

# -c requests a forced cold start; anything else takes the recovery path.
case $sq_start in
  c) gomonColdStart ;;
  *) gomonColdRecoveryStart ;;
esac

# Remove files left in MPI_TMPDIR by an earlier run.
echoLog "Removing old mpijob* files from $MPI_TMPDIR"
echoLog ""
rm -f $MPI_TMPDIR/mpijob*
echoLog "Removing old monitor.port* files from $MPI_TMPDIR"
echoLog ""
rm -f $MPI_TMPDIR/monitor.port.*

setup_sqpdsh

# Clean HBase classpath cache file
echo "Clean up HBase classpath cache file: $TRAF_VAR/hbase_classpath"
$SQPDSHA "rm -rf $TRAF_VAR/hbase_classpath"

# Clear unique strings
$SQPDSHA "cd $TRAF_HOME/sql/scripts; utilConfigDb -u"

echoLog "Executing sqipcrm (output to sqipcrm.out)"
sqipcrm > sqipcrm.out
echoLog "Executing cleanZKNodes (output to cleanZKNodes.out)"
cleanZKNodes > cleanZKNodes.out 2>&1

# HBase has to be reachable before any SQ process is started.
echoLog "Checking whether HBase is up"
if ! hbcheck 4 30; then
  lv_msg="It looks like HBase is not up, hbcheck utility failed."
  echoLog "$lv_msg"
  LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
  exit 1
fi
# Start the SQ processes in a background job. The job reports its result
# through the SQSTART_EXIT_STATUS file, so any stale copy is removed first.
rm -f $SQSTART_EXIT_STATUS
SQStartProcesses &

# Confirm the job is known to the shell and remember its pid.
getJobInfo SQStartProcesses
if [[ -z "$gv_job_pid" ]]; then
  lv_msg="Some problem with the initiation of the job to startup SQ processes"
  echoLog "$lv_msg"
  # Uses $sq_start (the start mode); the former $sqstart was never set.
  LogIt "End $script_name $sq_start, exit code: 3, $lv_msg"
  exit 3
fi
lv_sqstartup_job_pid=$gv_job_pid
echoLog "Background SQ Startup job (pid: $lv_sqstartup_job_pid)"
# Pause before the first poll of the background startup job.
sleep 5

declare -i lv_recovery_tx_cnt=-1
declare -i lv_startup_status=1
declare -i lv_num_checks=0
#Long recoveries on a cluster could delay TX services
declare -i lv_max_checks=600
declare -i lv_done
declare -i lv_process_count_curr
declare -i lv_process_count_last
declare -i lv_TM0_started
declare -i lv_max_iterations
declare -i lv_num_iterations
declare -i lv_sleep_per_iteration

# Check if resource is a work station
if [[ -e ${TRAF_HOME}/etc/ms.env ]]; then
  VIRT_NODES=$(awk '/SQ_VIRTUAL_NODES=/ { fields=split($0,virt,"="); if ( fields == 2 ) { virtnodes=virt[2];}} END {print virtnodes}' < $TRAF_HOME/etc/ms.env)
  if [[ -n "$VIRT_NODES" ]]; then
    #Some utilities on workstations will timeout if we wait too long
    lv_max_checks=360
  fi
fi

lv_done=0
lv_process_count_curr=0
lv_process_count_last=0
lv_TM0_started=0
lv_tm_svc_ready=0
lv_max_iterations=5
lv_num_iterations=0
lv_sleep_per_iteration=5
tx_srvc_ready_printed=0

# Poll every 10 seconds until the background job publishes its status file,
# disappears without one (status 3), or makes no progress for more than
# lv_max_checks consecutive polls (status 2). The poll counter restarts
# whenever the number of SQ processes has grown since the previous poll.
while (( lv_done == 0 )); do
  if [[ -e $SQSTART_EXIT_STATUS ]]; then
    lv_startup_status=$(cat $SQSTART_EXIT_STATUS)
    lv_done=1
    continue
  fi

  lv_process_count_curr=$(cstat 2>/dev/null | grep -v "\-\-\- \-\-\-\-" | grep -v 'pid ppid' | wc -l)
  lv_cnt_check=$(( lv_process_count_curr > lv_process_count_last ))
  if (( lv_cnt_check == 1 )); then
    lv_num_checks=0
  fi
  lv_process_count_last=$lv_process_count_curr

  # Disabled: reporting of transaction recovery progress once a TM exists.
  # lv_TM0_started=`cstat 2>/dev/null | grep -v "\-\-\- \-\-\-\-" | grep -v 'pid ppid' | grep TM | wc -l`
  # if [ $lv_TM0_started '!=' 0 ]; then
  # let lv_num_checks=0
  # lv_recovery_tx_cnt=`sqshell -c show | grep -v 'Configuration Change Notice' | grep DTM_RECOVERING_TX_COUNT | cut -d= -f2 | sed -e 's/^[[:space:]]*//'`
  # if [ "$lv_recovery_tx_cnt" -gt "0" ]; then
  # echo -ne "\r# of Transactions being recovered: $lv_recovery_tx_cnt "
  # else
  # if [ "$lv_tm_svc_ready" -eq 0 ]; then
  # echo -ne "\r# of Transactions being recovered: $lv_recovery_tx_cnt "
  # sqregck -r SQ_TXNSVC_READY -d 5 -i 5
  # lv_tm_svc_ready=$?
  # else
  # if [ $tx_srvc_ready_printed -eq 0 ]; then
  # echo -ne "\rTransaction Services Ready "
  # tx_srvc_ready_printed=1
  # fi
  # let lv_tm_svc_ready=1
  # fi
  # fi
  # else
  if (( lv_num_checks == 0 )); then
    echo -ne "\r# of SQ processes: $lv_process_count_last "
  else
    echo -n "."
  fi
  # fi

  # The job is gone but never wrote a status file: treat as abnormal exit.
  getJobInfo SQStartProcesses
  if [[ -z "$gv_job_pid" && ! -e $SQSTART_EXIT_STATUS ]]; then
    echoLog "The background SQ job (pid: $lv_sqstartup_job_pid) that was starting up the SQ processes exitted abnormally."
    echoLog "There are still $lv_process_count_curr SQ processes that exist in the environment."
    echoLog "Exiting."
    lv_done=1
    lv_startup_status=3
    break
  fi

  lv_num_checks=$(( lv_num_checks + 1 ))
  # Every 100th poll, print the time and a full sqcheck report.
  if (( lv_num_checks % 100 == 0 )); then
    date; sqcheck -f
  fi

  lv_done=$(( lv_num_checks > lv_max_checks ))
  if (( lv_done == 1 )); then
    lv_startup_status=2
    echoLog ""
    echoLog "SQ Startup is taking longer than usual and seems to be stalled at $lv_process_count_curr processes. "
    echoLog "The background SQ job (pid: $lv_sqstartup_job_pid) that is starting up the SQ processes is still running."
    echoLog "Exiting."
  fi
  sleep 10
done
if (( lv_startup_status == 0 )); then
  # Minimize the wait time as much as possible by regularly checking,
  # but don't wait forever either
  lv_done=0
  while (( lv_done == 0 && lv_num_iterations < lv_max_iterations )); do
    sleep $lv_sleep_per_iteration
    # sqcheck will return:
    # -1 - Not up ($?=255)
    # 0 - Fully up and operational
    # 1 - Partially up and operational
    # 2 - Partially up and NOT operational
    # On a return of 1, we loop for a while to let it come fully up
    sqcheck > /dev/null 2>&1
    sq_stat=$?
    if [[ $sq_stat != 1 ]]; then
      lv_done=1
      continue
    fi
    if (( lv_num_iterations == 0 )); then
      echo "The SQ environment is partially up! Continuing checks."
      echo -n "Checking"
    fi
    echo -n "."
    lv_num_iterations=$(( lv_num_iterations + 1 ))
  done
  # Terminate the progress line when every allowed iteration was used.
  if (( lv_num_iterations == lv_max_iterations )); then
    echo
  fi
  # Official sqcheck that will be displayed
  sqcheck | tee -a $SQMON_LOG
  echoLog "$(date)"
fi
# Start the DCS Service, then show the environment state again.
dcsstart
sleep 10
sqcheck | tee -a $SQMON_LOG

# Start the LOB Service
#comment out since it's unused right now
#lobstart

# Start the REST Service
reststart
echo
dcscheck | tee -a $SQMON_LOG
echo
echo "You can monitor the SQ shell log file : $SQMON_LOG"
echo

#check ulimit again to add it into log
checkUlimit
echo

# Report how long the whole startup took.
STOPtime=$(date +%s)
elapsed_seconds=$(( STOPtime - STARTtime ))
printf "Startup time "
hms "$elapsed_seconds"

# Finish with the status determined while waiting for the startup job.
if (( lv_startup_status != 0 )); then
  LogIt "Please check the log files in $SQLOG_DIR for errors."
fi
LogIt "End $script_name $sq_start, exit code: ${lv_startup_status}"
exit ${lv_startup_status}