#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This is a resource agent for controlling Hadoop daemons from
# the cluster.
# Source function library
. /etc/init.d/functions
# OCF_ROOT is /usr/lib/ocf
: ${OCF_FUNCTIONS_DIR=$(dirname $0)}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# Source networking configuration
[ -f /etc/sysconfig/network ] && . /etc/sysconfig/network
# Check that networking is up
[ "${NETWORKING}" = "no" ] && exit ${OCF_ERR_INSTALLED}
# Pull in Hadoop facts
. /etc/default/hadoop
. /etc/hadoop/conf/hadoop-env.sh
if [ "${OCF_RESKEY_daemon}" == "namenode" ]; then
user="${HADOOP_NAMENODE_USER}"
else
user="${HADOOP_JOBTRACKER_USER}"
fi
# The program being managed
program=hadoop-daemon.sh
DAEMON=${HADOOP_HOME}/bin/$program
# the HA probe script
HAPROBE=${HADOOP_HOME}/monitor/haprobe.sh
# ocf_run isn't available in 5.x, so here is a rewrite of the core operation.
# Input: a command and its arguments
# Output: 0 on success, or OCF_ERR_GENERIC on failure.
ocf_run() {
out=`"$@" 2>&1`
#`"$@"`
retval=$?
if ((${retval} == 0))
then
ocf_log info ${out}
else
echo $out
ocf_log err ${out}
ocf_log err "Command $* failed with return code ${retval}"
retval=${OCF_ERR_GENERIC}
fi
return ${retval};
}
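# Illustrative usage only (a sketch, not an extra call made by this agent):
#   ocf_run "${HAPROBE}" --pid "${OCF_RESKEY_pid}"
# logs the command output via ocf_log and maps any non-zero exit to OCF_ERR_GENERIC.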
# This is here because ocf_is_decimal is missing from the installed ocf-shellfuncs
ocf_is_decimal () {
# Accept only a non-empty run of decimal digits
[[ "$1" =~ ^[0-9]+$ ]]
}
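# Example: ocf_is_decimal "${OCF_RESKEY_boottime}" succeeds for "180000"
# but fails for "3m", "12.5", or an empty value.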
# Generate the metadata about this cluster entry
# IMPORTANT WARNING FOR PEOPLE MAINTAINING THIS:
# DO NOT PUT ANY QUOTES IN THE DESCRIPTION TEXT.
# --THESE ARE CONVERTED INTO ATTRIBUTES FOR SCHEMA VALIDATION; QUOTES BREAK THIS
#
metadata() {
cat <<EOT
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1-modified.dtd">
<resource-agent version="rgmanager 2.0" name="hadoop">
<version>1.0</version>
<longdesc lang="en">
Apache Hadoop resource agent
</longdesc>
<shortdesc lang="en">
hadoop resource agent
</shortdesc>
<parameters>
<parameter name="name" unique="1" primary="1">
<shortdesc lang="en">
Symbolic name for this hadoop service
</shortdesc>
<longdesc lang="en">
Symbolic name for this hadoop service, e.g. NameNode Process
</longdesc>
<content type="string"/>
</parameter>
<parameter name="daemon" unique="0" required="1">
<shortdesc lang="en">
The hadoop daemon name to run
</shortdesc>
<longdesc lang="en">
The hadoop daemon name to run, e.g. namenode
</longdesc>
<content type="string"/>
</parameter>
<parameter name="ambariproperties" unique="0" required="0">
<shortdesc lang="en">
Ambari properties as comma separated key value pairs
</shortdesc>
<longdesc lang="en">
Example property value:
ambariproperties="server=localhost,port=8080,protocol=http,user=admin,password=admin,cluster=c1,output=/var/log/ambari_relocate.log"
</longdesc>
<content type="string"/>
</parameter>
<parameter name="url" unique="0" required="0">
<shortdesc lang="en">
URL to probe; use an empty string or null to indicate undefined
</shortdesc>
<longdesc lang="en">
URL to probe; use an empty string or null to indicate undefined
</longdesc>
<content type="string"/>
</parameter>
<parameter name="pid" unique="0" required="0">
<shortdesc lang="en">
The filename of any .pid file to monitor.
</shortdesc>
<longdesc lang="en">
The filename of any .pid file identifying a process to monitor.
This is of little benefit when monitoring a live cluster, as the HTTP and IPC
probes are more rigorous. Probing the process by way of the pid file
is most useful during startup, as it can detect the failure of a process
early.
</longdesc>
<content type="string"/>
</parameter>
<parameter name="path" unique="0" required="0">
<shortdesc lang="en">
The directory path in HDFS to probe
</shortdesc>
<longdesc lang="en">
The path in the HDFS filesystem to probe; default is "/"
</longdesc>
<content type="string"/>
</parameter>
<parameter name="boottime" unique="0" required="0">
<shortdesc lang="en">
The time in milliseconds that the service is required to be live by.
</shortdesc>
<longdesc lang="en">
The time in milliseconds that the service is required to be live by.
For the Namenode, this includes the time to replay the edit log.
</longdesc>
<content type="integer" default="180000"/>
</parameter>
<parameter name="probetime" unique="0" required="0">
<shortdesc lang="en">
The time in milliseconds that a probe should take.
</shortdesc>
<longdesc lang="en">
The maximum time in milliseconds that a probe should take. This must be
long enough to cover GC pauses, so that a long GC does not get mistaken
for a hung process.
</longdesc>
<content type="integer" default="120000"/>
</parameter>
<parameter name="stoptime" unique="0" required="0">
<shortdesc lang="en">
The time in milliseconds that the service is required to stop gracefully by.
</shortdesc>
<longdesc lang="en">
The time in milliseconds that the service is required to come to
a clean halt.
If the process has not finished by the end of this time period, it
is forcefully killed via a kill -9 command.
</longdesc>
<content type="integer" default="60000"/>
</parameter>
<parameter name="waitfs" unique="0" required="0">
<shortdesc lang="en">
flag to indicate whether or not the filesystem needs to come up first
</shortdesc>
<longdesc lang="en">
Indicate that the HA monitor should wait until the fs is live before
declaring that the service is live
</longdesc>
<content type="boolean" default="false"/>
</parameter>
</parameters>
<actions>
<!-- start time doesn't provide a timeout hint as waitfs actions
may need to block startup for an extended period of time. -->
<action name="start" />
<action name="stop" timeout="100s"/>
<!-- includes shutdown time and edit log time -->
<action name="recover" timeout="4m"/>
<!-- Regular status check -->
<action name="monitor" interval="20s" timeout="120s"/>
<action name="status" interval="20s" timeout="120s"/>
<!-- Depth checks -->
<!-- This depth checks hdfs is accessible -->
<!-- <action name="monitor" depth="10" interval="30s" timeout="120s"/> -->
<!-- <action name="status" depth="10" interval="30s" timeout="120s"/> -->
<action name="meta-data" timeout="5s"/>
<action name="validate-all" timeout="5s"/>
</actions>
</resource-agent>
EOT
}
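# To inspect the generated metadata by hand you can, for example, pipe it
# through xmllint to confirm it is well-formed XML (illustrative only):
#   ./<this-script> meta-data | xmllint --noout -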
#If you want to test the scripts, set some properties
# export OCF_RESKEY_httpport="50070"
# export OCF_RESKEY_daemon="namenode"
# export OCF_RESKEY_ip="localhost"
# export OCF_CHECK_LEVEL="100"
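# then invoke an action directly against this script, for example (a local
# smoke test only, not something the cluster manager does):
#   ./<this-script> monitor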
# Start the operation
start() {
assert_binary
ocf_log info "Starting hadoop-${OCF_RESKEY_daemon}"
daemon --user ${user} --check ${DAEMON} ${DAEMON} --config /etc/hadoop/conf start ${OCF_RESKEY_daemon}
RETVAL=$?
if [ ${RETVAL} -ne 0 ]; then
ocf_log err "Failed to start ${DAEMON}: ${RETVAL}"
return ${RETVAL}
fi
sleep 15
dfs_bootstrap_check
RETVAL=$?
echo
if [ ${RETVAL} -ne 0 ]; then
ocf_log err "Failed to start dfs_bootstrap_check}: ${RETVAL}"
return ${OCF_ERR_GENERIC}
fi
return 0
}
stop() {
HADOOP_STOP_TIMEOUT=${OCF_RESKEY_stoptime}
ocf_log info "Stopping hadoop-${OCF_RESKEY_daemon} with timeout ${HADOOP_STOP_TIMEOUT}"
daemon --user ${user} --check ${DAEMON} ${DAEMON} --config /etc/hadoop/conf stop ${OCF_RESKEY_daemon}
RETVAL=$?
ocf_log info "stop command issued, retval is ${RETVAL}"
if [ ${RETVAL} -ne 0 ]; then
ocf_log err "Failed to stop ${program} ${OCF_RESKEY_daemon}"
return ${OCF_ERR_GENERIC}
fi
echo
# Spin waiting for shutdown
# while url_check
# do
# ocf_log debug "Resource has not stopped yet, waiting"
# sleep 15
# done
#
return ${OCF_SUCCESS}
}
#
# Verify the binary is installed
#
# Usage: verify_binary
# Result: $OCF_ERR_INSTALLED = binary not installed
# 0 = binary installed
#
verify_binary() {
# Report that $prog does not exist, or is not executable
if [ ! -x "${DAEMON}" ]; then
ocf_log err "Binary ${DAEMON} doesn't exist"
return ${OCF_ERR_INSTALLED}
fi
return ${OCF_SUCCESS}
}
assert_binary() {
verify_binary || exit $?
}
# status checking.
# This exits during its execution, as this simplifies
# the logic for different layers of check
status_check() {
# assert_arguments_are_valid
ocf_log info "Checking ${OCF_RESKEY_daemon}, Level ${OCF_CHECK_LEVEL}"
# Look for the check level, as in some tests it isn't set
if [ "x" == "x${OCF_CHECK_LEVEL}" ]
then
ocf_log err "Environment variable OCF_CHECK_LEVEL not set"
exit ${OCF_ERR_ARGS}
fi
retval=0
# website check
# url_check
# retval=$?
# retval=pid_check
# if [ $retval -ne 0 ]
# then
# exit ${retval}
# fi
#
# [ "${OCF_CHECK_LEVEL}" -lt 10 ] && exit ${retval}
# Depth level 10 check
dfs_check
retval=$?
#
# if [ $? -ne 0 ]; then
# retval=${OCF_NOT_RUNNING}
# fi
exit ${retval}
}
# HA probe
dfs_check() {
ocf_run "${HAPROBE}" --file ${OCF_RESKEY_path} --pid ${OCF_RESKEY_pid} --url ${OCF_RESKEY_url} --timeout ${OCF_RESKEY_probetime}
if [ $? -ne 0 ]
then
ocf_log warn "Service ${OCF_RESKEY_daemon} is not running according to checks: -file ${OCF_RESKEY_path} --pid ${OCF_RESKEY_pid} --url ${OCF_RESKEY_url} "
return ${OCF_NOT_RUNNING}
fi
return ${OCF_SUCCESS}
}
# Run a bootstrap check
# this can include different probes and timeouts
dfs_bootstrap_check() {
ocf_run "${HAPROBE}" --file ${OCF_RESKEY_path} --pid ${OCF_RESKEY_pid} --url ${OCF_RESKEY_url} --timeout ${OCF_RESKEY_probetime} --boottimeout ${OCF_RESKEY_boottime} --waitfs ${OCF_RESKEY_waitfs}
if [ $? -ne 0 ]
then
ocf_log warn "Service ${OCF_RESKEY_daemon} is not booting according to checks: -file ${OCF_RESKEY_path} --pid ${OCF_RESKEY_pid} --url ${OCF_RESKEY_url} "
return ${OCF_NOT_RUNNING}
fi
return ${OCF_SUCCESS}
}
# this is a PID check
pid_check() {
ocf_run "${HAPROBE} --pid ${OCF_RESKEY_pid}"
if [ $? -ne 0 ]; then
return ${OCF_NOT_RUNNING}
fi
return ${OCF_SUCCESS}
}
# fill in the default values of a service
fill_in_defaults() {
: ${OCF_RESKEY_boottime="180000"}
: ${OCF_RESKEY_daemon="namenode"}
: ${OCF_RESKEY_httpport="50070"}
: ${OCF_RESKEY_ip="localhost"}
: ${OCF_RESKEY_path="/"}
: ${OCF_RESKEY_pid="null"}
: ${OCF_RESKEY_probetime="120000"}
: ${OCF_RESKEY_stoptime="60000"}
: ${OCF_RESKEY_url="http://localhost:50070/"}
: ${OCF_RESKEY_waitfs="false"}
}
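# For reference, rgmanager supplies these OCF_RESKEY_* values from the attributes
# of the resource element in cluster.conf. A hypothetical entry (names and values
# here are illustrative, not prescriptive) might look like:
#   <hadoop name="NameNode Process" daemon="namenode"
#           url="http://namenode:50070/" boottime="180000"/>
# Any attribute left out falls back to the defaults filled in above.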
dump_environment() {
ocf_log info "$(env)"
}
# Relocate Ambari managed master to current host on failover
execute_ambari_relocate_probe() {
parse_and_validate_ambari_properties
retval=$?
if [ $retval -eq 2 ] ; then
return 0
elif [ $retval -eq 1 ] ; then
exit ${retval}
fi
if [ -z "$AMBARI_RELOCATE_PROBE" ] ; then
AMBARI_RELOCATE_PROBE="relocate_resources.py"
fi
NEW_HOSTNAME=$(hostname -f)
if [ "${OCF_RESKEY_daemon}" == "namenode" ] ; then
SERVICE_NAME="HDFS"
COMP_NAME="NAMENODE"
elif [ "${OCF_RESKEY_daemon}" == "jobtracker" ] ; then
SERVICE_NAME="MAPREDUCE"
COMP_NAME="JOBTRACKER"
elif [ "${OCF_RESKEY_daemon}" == "historyserver" ] ; then
SERVICE_NAME="MAPREDUCE"
COMP_NAME="JOBTRACKER"
else
ocf_log err "Unknown daemon ${OCF_RESKEY_daemon}"
return ${OCF_ERR_ARGS};
fi
if [ -n "${AMBARI_OUTPUT}" ]; then
OUPUT_FILE_CMD="-o ${AMBARI_OUTPUT}"
fi
"${AMBARI_RELOCATE_PROBE}" -s ${AMBARI_SERVER} -p ${AMBARI_PORT} -r ${AMBARI_PROTOCOL} -c ${AMBARI_CLUSTER} -e "${SERVICE_NAME}" -m "${COMP_NAME}" -n "${NEW_HOSTNAME}" -u "${AMBARI_USER}" -w "${AMBARI_PASSWD} ${OUPUT_FILE_CMD}"
retval=$?
if [ $retval -eq 0 ] ; then
ocf_log info "Ambari master successfully relocated."
elif [ $retval -eq 1 ] ; then
ocf_log error "Ambari relocate master failed. Continuing with failover..."
elif [ $retval -eq 2 ] ; then
ocf_log info "No action required from ambari probe."
elif [ $retval -eq 3 ] ; then
ocf_log err "Ambari relocate request verification failed. Exiting..."
exit ${retval}
else
ocf_log error "Unknown return code from ambari probe ${retval}."
fi
return $retval
}
# Read Ambari properties as comma separated key value pairs from cluster.conf
# Property name: 'ambariproperties'.
# Example property value:
# ambariproperties="server=localhost,port=8080,protocol=http,user=admin,password=admin,cluster=c1,output=/var/log/ambari_relocate.log"
parse_and_validate_ambari_properties() {
if [ -n "${OCF_RESKEY_ambariproperties}" ] ; then
ocf_log info "Ambari properties found: ${OCF_RESKEY_ambariproperties}"
IFS=',' read -ra properties <<< "${OCF_RESKEY_ambariproperties}"
for i in "${properties[@]}"; do
if [[ "$i" == "server"* ]] ; then AMBARI_SERVER=$(echo $i | cut -d"=" -f2); fi
if [[ "$i" == "port"* ]] ; then AMBARI_PORT=$(echo $i | cut -d"=" -f2); fi
if [[ "$i" == "protocol"* ]] ; then AMBARI_PROTOCOL=$(echo $i | cut -d"=" -f2); fi
if [[ "$i" == "user"* ]] ; then AMBARI_USER=$(echo $i | cut -d"=" -f2); fi
if [[ "$i" == "password"* ]] ; then AMBARI_PASSWD=$(echo $i | cut -d"=" -f2); fi
if [[ "$i" == "cluster"* ]] ; then AMBARI_CLUSTER=$(echo $i | cut -d"=" -f2); fi
if [[ "$i" == "output"* ]] ; then AMBARI_OUTPUT=$(echo $i | cut -d"=" -f2); fi
done
if [ -z "${AMBARI_SERVER}" ] ; then
ocf_log err "required ambari property 'server' is unset"
return 1
fi
if [ -z "${AMBARI_PORT}" ] ; then
ocf_log err "required ambari property 'port' is unset"
return 1
fi
if [ -z "${AMBARI_PROTOCOL}" ] ; then
ocf_log err "required ambari property 'protocol' is unset"
return 1
fi
if [ -z "${AMBARI_USER}" ] ; then
ocf_log err "required ambari property 'user' is unset"
return 1
fi
if [ -z "${AMBARI_PASSWD}" ] ; then
ocf_log err "required ambari property 'password' is unset"
return 1
fi
if [ -z "${AMBARI_CLUSTER}" ] ; then
ocf_log err "required ambari property 'cluster' is unset"
return 1
fi
else
ocf_log info "No Ambari properties found."
return 2
fi
}
# Validate the arguments to the service.
# This assumes the defaults have already been filled in, so we only check that
# the mandatory properties exist and that the numeric properties are valid.
validate_arguments_and_state() {
if [ "x" == "x${OCF_RESKEY_daemon}" ] ; then
dump_environment
ocf_log err "required property 'daemon' is unset"
return ${OCF_ERR_ARGS};
fi
if ! ocf_is_decimal "${OCF_RESKEY_boottime}"; then
ocf_log err "Option 'boottime' is not numeric!"
return ${OCF_ERR_CONFIGURED}
fi
if ! ocf_is_decimal "${OCF_RESKEY_probetime}"; then
ocf_log err "Option 'probetime' is not numeric!"
return ${OCF_ERR_CONFIGURED}
fi
verify_binary
return $?
}
# Validate the arguments; exit with an error code
# if they are not valid
assert_arguments_are_valid() {
validate_arguments_and_state
retval=$?
[ ${retval} -ne 0 ] && exit ${retval}
}
# ================================================================================
# This is the live code
# ================================================================================
# Entry point checks parameters
fill_in_defaults
# then switch on the argument
case "$1" in
start)
assert_arguments_are_valid
execute_ambari_relocate_probe
start
exit $?
;;
stop)
# assert_arguments_are_valid
if ! stop; then
exit ${OCF_ERR_GENERIC}
fi
exit 0
;;
status|monitor)
# check the status of the live system
status_check
;;
meta-data)
# generate the metadata
metadata
exit 0
;;
recover|restart)
# validate_arguments_and_state
execute_ambari_relocate_probe
ocf_log info "Service restart"
$0 stop || exit ${OCF_ERR_GENERIC}
$0 start || exit ${OCF_ERR_GENERIC}
exit 0
;;
validate-all)
validate_arguments_and_state
exit $?
;;
# this is a non-standard operation to work out what is going on
diagnostics)
echo PATH=${PATH}
echo java is at `which java`
echo JAVA_HOME is ${JAVA_HOME}
dump_environment
exit 0
;;
*)
echo $"Usage: $0 {start|stop|status|monitor|restart|recover|validate-all|meta-data|diagnostics}"
exit ${OCF_ERR_UNIMPLEMENTED}
;;
esac