| #!/bin/bash |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| |
| |
| # This is a resource agent for controlling hadoop daemons from |
| # cluster. |
| |
| # Source function library |
# Source function library (provides daemon(), etc. from the RHEL-style
# init framework)
. /etc/init.d/functions

# OCF_ROOT is /usr/lib/ocf
# Locate the OCF shell function library next to this script unless the
# caller already exported OCF_FUNCTIONS_DIR.  Quoted so a path
# containing spaces does not break the source.
: "${OCF_FUNCTIONS_DIR=$(dirname "$0")}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"

# Source networking configuration
[ -f /etc/sysconfig/network ] && . /etc/sysconfig/network

# Check that networking is up
[ "${NETWORKING}" = "no" ] && exit "${OCF_ERR_INSTALLED}"

# Pull in Hadoop facts
. /etc/default/hadoop
. /etc/hadoop/conf/hadoop-env.sh

# Pick the unix account that owns the managed daemon: the namenode has
# its own user, everything else runs as the jobtracker user.
if [ "${OCF_RESKEY_daemon}" == "namenode" ]; then
    user="${HADOOP_NAMENODE_USER}"
else
    user="${HADOOP_JOBTRACKER_USER}"
fi

# The program being managed
program=hadoop-daemon.sh
DAEMON=${HADOOP_HOME}/bin/$program
# the HA probe script
HAPROBE=${HADOOP_HOME}/monitor/haprobe.sh
| |
| |
| |
# ocf_run is not available in the 5.x cluster stack, so here is a
# rewrite of the core operation.
# Input: a command and its arguments
# Output: returns 0 on success, OCF_ERR_GENERIC on failure.
# Run a command, capture its combined stdout/stderr and log it.
# Input:  a command and its arguments
# Output: returns 0 on success, OCF_ERR_GENERIC on failure; on failure
#         the command output is also echoed to stdout.
ocf_run() {
    local out retval
    out=$("$@" 2>&1)
    retval=$?
    if [ ${retval} -eq 0 ]; then
        # Quoted so empty/whitespace-laden output is logged as one
        # message instead of being word-split and glob-expanded.
        ocf_log info "${out}"
    else
        printf '%s\n' "${out}"
        ocf_log err "${out}"
        ocf_log err "Command $* failed with return code ${retval}"
        retval=${OCF_ERR_GENERIC}
    fi
    return ${retval}
}
| |
| |
# Local replacement for the missing ocf_is_decimal helper.
# Returns 0 if $1 is a non-negative decimal integer, 1 otherwise.
# (The previous `let i=10#$1` form returned 1 whenever the value was 0,
# so a legitimate "0" argument was wrongly rejected.)
ocf_is_decimal () {
    case "$1" in
        ""|*[!0-9]*) return 1 ;;  # empty or contains a non-digit
        *)           return 0 ;;
    esac
}
| |
| # Generate the metadata about this cluster entry |
| |
| # IMPORTANT WARNING FOR PEOPLE MAINTAINING THIS |
# DO NOT PUT ANY QUOTES IN THE DESCRIPTION TEXT.
| # --THESE ARE CONVERTED INTO ATTRIBUTES FOR SCHEMA VALIDATION; QUOTES BREAKS THIS |
| # |
| |
metadata() {
# Emit the rgmanager/OCF resource-agent metadata XML (parameters and
# supported actions) on stdout, as required by the meta-data action.
# NOTE: the heredoc delimiter is unquoted, but the XML body contains no
# shell expansions, so it is emitted verbatim.
cat <<EOT
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1-modified.dtd">
<resource-agent version="rgmanager 2.0" name="hadoop">
<version>1.0</version>

<longdesc lang="en">
Apache Hadoop resource agent
</longdesc>
<shortdesc lang="en">
hadoop resource agent
</shortdesc>

<parameters>
<parameter name="name" unique="1" primary="1">
<shortdesc lang="en">
Symbolic name for this hadoop service
</shortdesc>
<longdesc lang="en">
Symbolic name for this hadoop service, e.g. NameNode Process
</longdesc>
<content type="string"/>
</parameter>

<parameter name="daemon" unique="0" required="1">
<shortdesc lang="en">
The hadoop daemon name to run
</shortdesc>
<longdesc lang="en">
The hadoop daemon name to run, e.g. namenode
</longdesc>
<content type="string"/>
</parameter>

<parameter name="ambariproperties" unique="0" required="0">
<shortdesc lang="en">
Ambari properties as comma separated key value pairs
</shortdesc>
<longdesc lang="en">
Example property value:
ambariproperties="server=localhost,port=8080,protocol=http,user=admin,password=admin,cluster=c1,output=/var/log/ambari_relocate.log"
</longdesc>
<content type="string"/>
</parameter>

<parameter name="url" unique="0" required="0">
<shortdesc lang="en">
URL to probe, use empty string or null to indicate undefined
</shortdesc>
<longdesc lang="en">
URL to probe, use empty string or null to indicate undefined
</longdesc>
<content type="string"/>
</parameter>

<parameter name="pid" unique="0" required="0">
<shortdesc lang="en">
The filename of any .pid file to monitor.
</shortdesc>
<longdesc lang="en">
The filename of any .pid file identifying a process to monitor.
This is of little benefit when monitoring a live cluster, as the HTTP and IPC
probes are more rigorous. Probing the process by pay of the pid file
is most useful during startup, as it can detect the failure of a process
early.
</longdesc>
<content type="string"/>
</parameter>

<parameter name="path" unique="0" required="0">
<shortdesc lang="en">
The directory path in HDFS to probe
</shortdesc>
<longdesc lang="en">
The path in the HDFS filesystem to probe; default is "/"
</longdesc>
<content type="string"/>
</parameter>

<parameter name="boottime" unique="0" required="0">
<shortdesc lang="en">
The time in milliseconds that the service is required to be live by.
</shortdesc>
<longdesc lang="en">
The time in milliseconds that the service is required to be live by.
For the Namenode, this includes the time to replay the edit log.
</longdesc>
<content type="integer" default="180000"/>
</parameter>

<parameter name="probetime" unique="0" required="0">
<shortdesc lang="en">
The time in milliseconds that a probe should take.
</shortdesc>
<longdesc lang="en">
The maximum time in milliseconds that a probe should take. This must be
long enough to cover GC pauses, so that a long GC does not get mistaken
for a hung process.
</longdesc>
<content type="integer" default="120000"/>
</parameter>

<parameter name="stoptime" unique="0" required="0">
<shortdesc lang="en">
The time in milliseconds that the service is required to be stop gracefully by.
</shortdesc>
<longdesc lang="en">
The time in milliseconds that the service is required to to come to
a clean halt.
If the process has not finished by the end of this time period, it
is forcefully killed via a kill-9 command.
</longdesc>
<content type="integer" default="60000"/>
</parameter>

<parameter name="waitfs" unique="0" required="0">
<shortdesc lang="en">
flag to indicate whether or not the filesystem needs to come up first
</shortdesc>
<longdesc lang="en">
Indicate that the HA monitor should wait until the fs is live before
declaring that the service is live
</longdesc>
<content type="boolean" default="false"/>
</parameter>

</parameters>

<actions>

<!-- start time doesnt provide a timeout hint as waitfs actions
may need to block startup for an extended period of time. -->
<action name="start" />
<action name="stop" timeout="100s"/>
<!-- includes shutdown time and edit log time -->
<action name="recover" timeout="4m"/>

<!-- Regular status check -->
<action name="monitor" interval="20s" timeout="120s"/>
<action name="status" interval="20s" timeout="120s"/>

<!-- Depth checks -->
<!-- This depth checks hdfs is accessible -->
<!-- <action name="monitor" depth="10" interval="30s" timeout="120s"/> -->
<!-- <action name="status" depth="10" interval="30s" timeout="120s"/> -->

<action name="meta-data" timeout="5s"/>
<action name="validate-all" timeout="5s"/>
</actions>
</resource-agent>
EOT
}
| |
| #If you want to test the scripts, set some properties |
| # export OCF_RESKEY_httpport="50070" |
| # export OCF_RESKEY_daemon="namenode" |
| # export OCF_RESKEY_ip="localhost" |
| # export OCF_CHECK_LEVEL="100" |
| |
| |
| # Start the operation |
# Start the managed hadoop daemon and wait for it to pass the bootstrap
# probe.
# Returns 0 on success, daemon()'s exit code when the launch fails, or
# OCF_ERR_GENERIC when the bootstrap probe fails.
start() {
    assert_binary

    ocf_log info "Starting hadoop-${OCF_RESKEY_daemon}"
    daemon --user "${user}" --check "${DAEMON}" "${DAEMON}" --config /etc/hadoop/conf start "${OCF_RESKEY_daemon}"
    RETVAL=$?
    if [ ${RETVAL} -ne 0 ]; then
        ocf_log err "Failed to start ${DAEMON}: ${RETVAL}"
        return ${RETVAL}
    fi
    # Give the daemon a moment to come up before probing it.
    sleep 15

    dfs_bootstrap_check
    RETVAL=$?
    echo
    if [ ${RETVAL} -ne 0 ]; then
        ocf_log err "Failed to start dfs_bootstrap_check: ${RETVAL}"
        return ${OCF_ERR_GENERIC}
    fi
    return 0
}
| |
# Stop the managed hadoop daemon gracefully.
# Returns OCF_SUCCESS on success, OCF_ERR_GENERIC when the stop command
# fails.
stop() {
    # NOTE(review): exported so the child hadoop-daemon.sh process can
    # see the timeout in its environment -- as a plain shell variable it
    # never reached the daemon.  Confirm hadoop-daemon.sh reads
    # HADOOP_STOP_TIMEOUT from the environment.
    export HADOOP_STOP_TIMEOUT="${OCF_RESKEY_stoptime}"
    ocf_log info "Stopping hadoop-${OCF_RESKEY_daemon} with timeout ${HADOOP_STOP_TIMEOUT}"
    daemon --user "${user}" --check "${DAEMON}" "${DAEMON}" --config /etc/hadoop/conf stop "${OCF_RESKEY_daemon}"
    RETVAL=$?
    ocf_log info "stop command issued, retval is ${RETVAL}"
    if [ ${RETVAL} -ne 0 ]; then
        ocf_log err "Failed to stop ${program} ${OCF_RESKEY_daemon}"
        return ${OCF_ERR_GENERIC}
    fi

    echo
    return ${OCF_SUCCESS}
}
| |
| # |
| # Verify the binary is installed |
| # |
| # Usage: verify_binary |
| # Result: $OCF_ERR_INSTALLED = binary not installed |
| # 0 = binary installed |
| # |
# Check that the managed binary exists and is executable.
# Result: 0 when installed, $OCF_ERR_INSTALLED (after logging) when not.
verify_binary() {
    if [ -x "${DAEMON}" ]; then
        return ${OCF_SUCCESS}
    fi
    ocf_log err "Binary ${DAEMON} doesn't exist"
    return ${OCF_ERR_INSTALLED}
}
| |
# Abort the whole script (propagating verify_binary's error code) when
# the managed binary is missing; no-op when it is present.
assert_binary() {
    verify_binary
    rc=$?
    if [ ${rc} -ne 0 ]; then
        exit ${rc}
    fi
}
| |
| |
| |
| |
| # status checking. |
| # This exits during its execution, as this simplifies |
| # the logic for different layers of check |
# Probe the health of the managed service and exit with the verdict.
# NOTE: this function never returns -- it exits the script, which keeps
# the logic for the different layers of check simple.
status_check() {
    ocf_log info "Checking ${OCF_RESKEY_daemon}, Level ${OCF_CHECK_LEVEL}"

    # Some test harnesses do not set the check level; treat that as an
    # argument error.
    if [ -z "${OCF_CHECK_LEVEL}" ]; then
        ocf_log err "Environment variable OCF_CHECK_LEVEL not set"
        exit ${OCF_ERR_ARGS}
    fi

    # Delegate to the HA probe and propagate its verdict as our exit
    # status.
    dfs_check
    exit $?
}
| |
| |
| # HA probe |
# HA probe: ask haprobe.sh whether the service is currently healthy.
# Returns OCF_SUCCESS when healthy, OCF_NOT_RUNNING otherwise.
dfs_check() {
    # Option values are quoted so empty/whitespace values cannot shift
    # the probe's argument positions.
    if ! ocf_run "${HAPROBE}" --file "${OCF_RESKEY_path}" --pid "${OCF_RESKEY_pid}" --url "${OCF_RESKEY_url}" --timeout "${OCF_RESKEY_probetime}"; then
        ocf_log warn "Service ${OCF_RESKEY_daemon} is not running according to checks: --file ${OCF_RESKEY_path} --pid ${OCF_RESKEY_pid} --url ${OCF_RESKEY_url} "
        return ${OCF_NOT_RUNNING}
    fi
    return ${OCF_SUCCESS}
}
| |
| # Run a bootstrap check |
| # this can include different probes and timeouts |
# Bootstrap probe: like dfs_check but with the boot timeout and the
# optional wait-for-filesystem behaviour enabled, for use during start.
# Returns OCF_SUCCESS when the service came up, OCF_NOT_RUNNING otherwise.
dfs_bootstrap_check() {
    # Option values are quoted so empty/whitespace values cannot shift
    # the probe's argument positions.
    if ! ocf_run "${HAPROBE}" --file "${OCF_RESKEY_path}" --pid "${OCF_RESKEY_pid}" --url "${OCF_RESKEY_url}" --timeout "${OCF_RESKEY_probetime}" --boottimeout "${OCF_RESKEY_boottime}" --waitfs "${OCF_RESKEY_waitfs}"; then
        ocf_log warn "Service ${OCF_RESKEY_daemon} is not booting according to checks: --file ${OCF_RESKEY_path} --pid ${OCF_RESKEY_pid} --url ${OCF_RESKEY_url} "
        return ${OCF_NOT_RUNNING}
    fi
    return ${OCF_SUCCESS}
}
| |
| |
| # this is a PID check |
# Probe liveness via the pid file only.
# Returns OCF_SUCCESS when alive, OCF_NOT_RUNNING otherwise.
pid_check() {
    # Pass the probe and its option as separate words: the previous
    # version quoted the whole command line as ONE argument, which made
    # ocf_run try to exec a program literally named
    # "<probe> --pid <pid>" -- so the check always failed.
    if ! ocf_run "${HAPROBE}" --pid "${OCF_RESKEY_pid}"; then
        return ${OCF_NOT_RUNNING}
    fi
    return ${OCF_SUCCESS}
}
| |
| # fill in the default values of a service |
# Populate any unset OCF_RESKEY_* parameters with their defaults.
# Parameters that are already set -- even to the empty string -- keep
# their value (matching the ${var=default} assignment semantics).
fill_in_defaults() {
    [ "${OCF_RESKEY_boottime+x}" ]  || OCF_RESKEY_boottime="180000"
    [ "${OCF_RESKEY_daemon+x}" ]    || OCF_RESKEY_daemon="namenode"
    [ "${OCF_RESKEY_httpport+x}" ]  || OCF_RESKEY_httpport="50070"
    [ "${OCF_RESKEY_ip+x}" ]        || OCF_RESKEY_ip="localhost"
    [ "${OCF_RESKEY_path+x}" ]      || OCF_RESKEY_path="/"
    [ "${OCF_RESKEY_pid+x}" ]       || OCF_RESKEY_pid="null"
    [ "${OCF_RESKEY_probetime+x}" ] || OCF_RESKEY_probetime="120000"
    [ "${OCF_RESKEY_stoptime+x}" ]  || OCF_RESKEY_stoptime="60000"
    [ "${OCF_RESKEY_url+x}" ]       || OCF_RESKEY_url="http://localhost:50070/"
    [ "${OCF_RESKEY_waitfs+x}" ]    || OCF_RESKEY_waitfs="false"
}
| |
# Log the entire process environment (used by the diagnostics action).
dump_environment() {
    # Deliberately left unquoted so the expansion is word-split exactly
    # the way the original backtick form was.
    # shellcheck disable=SC2046
    ocf_log info $(env)
}
| |
| # Relocate Ambari managed master to current host on failover |
# Relocate the Ambari-managed master component to the current host on
# failover.
# Returns the probe's exit code (0 relocated, 1 failed-but-continue,
# 2 no action needed) or 0 when no ambari properties are configured;
# exits the script when the configuration is invalid or verification
# fails.
execute_ambari_relocate_probe() {
    # Capture the parser's return code explicitly -- the previous
    # "retval = cmd" had spaces around '=' and so was executed as a
    # command named 'retval', never capturing anything.
    parse_and_validate_ambari_properties
    retval=$?
    if [ $retval -eq 2 ] ; then
        # No ambari properties configured: nothing to relocate.
        return 0
    elif [ $retval -eq 1 ] ; then
        exit ${retval}
    fi

    if [ -z "$AMBARI_RELOCATE_PROBE" ] ; then
        AMBARI_RELOCATE_PROBE="relocate_resources.py"
    fi

    NEW_HOSTNAME=$(hostname -f)

    # Map the managed daemon onto the Ambari service/component pair.
    if [ "${OCF_RESKEY_daemon}" == "namenode" ] ; then
        SERVICE_NAME="HDFS"
        COMP_NAME="NAMENODE"
    elif [ "${OCF_RESKEY_daemon}" == "jobtracker" ] ; then
        SERVICE_NAME="MAPREDUCE"
        COMP_NAME="JOBTRACKER"
    elif [ "${OCF_RESKEY_daemon}" == "historyserver" ] ; then
        SERVICE_NAME="MAPREDUCE"
        COMP_NAME="JOBTRACKER"
    else
        ocf_log err "Unknown daemon ${OCF_RESKEY_daemon}"
        return ${OCF_ERR_ARGS};
    fi

    if [ -n "${AMBARI_OUTPUT}" ]; then
        OUTPUT_FILE_CMD="-o ${AMBARI_OUTPUT}"
    fi

    # Keep the password and the optional output-file option as separate
    # arguments: the previous quoting folded "-o <file>" INTO the
    # password argument.  OUTPUT_FILE_CMD is intentionally unquoted so
    # it expands to the two words "-o <file>" (or to nothing).
    "${AMBARI_RELOCATE_PROBE}" -s ${AMBARI_SERVER} -p ${AMBARI_PORT} -r ${AMBARI_PROTOCOL} -c ${AMBARI_CLUSTER} -e "${SERVICE_NAME}" -m "${COMP_NAME}" -n "${NEW_HOSTNAME}" -u "${AMBARI_USER}" -w "${AMBARI_PASSWD}" ${OUTPUT_FILE_CMD}

    retval=$?
    if [ $retval -eq 0 ] ; then
        ocf_log info "Ambari master successfully relocated."
    elif [ $retval -eq 1 ] ; then
        ocf_log err "Ambari relocate master failed. Continuing with failover..."
    elif [ $retval -eq 2 ] ; then
        ocf_log info "No action required from ambari probe."
    elif [ $retval -eq 3 ] ; then
        ocf_log err "Ambari relocate request verification failed. Exiting..."
        exit ${retval}
    else
        ocf_log err "Unknown return code from ambari probe ${retval}."
    fi

    return $retval
}
| |
| # Read Ambari properties as comma separated key value pairs from cluster.conf |
| # Property name: 'ambariproperties'. |
| # Example property value: |
| # ambariproperties="server=localhost,port=8080,protocol=http,user=admin,password=admin,cluster=c1,output=/var/log/ambari_relocate.log" |
# Read Ambari properties as comma separated key value pairs from
# cluster.conf (property name: 'ambariproperties').
# Example property value:
# ambariproperties="server=localhost,port=8080,protocol=http,user=admin,password=admin,cluster=c1,output=/var/log/ambari_relocate.log"
# Sets AMBARI_SERVER/PORT/PROTOCOL/USER/PASSWD/CLUSTER/OUTPUT.
# Returns: 0 when all required properties are present, 1 when a
# required property is missing, 2 when no properties are configured.
parse_and_validate_ambari_properties() {
    if [ -n "${OCF_RESKEY_ambariproperties}" ] ; then
        ocf_log info "Ambari properties found: ${OCF_RESKEY_ambariproperties}"

        IFS=',' read -ra properties <<< "${OCF_RESKEY_ambariproperties}"

        # Iterate over EVERY key=value pair: the previous `$properties`
        # expansion yielded only the first array element, so every
        # property after the first was silently dropped and validation
        # always failed.
        for i in "${properties[@]}"; do
            case "$i" in
                server=*)   AMBARI_SERVER=${i#*=} ;;
                port=*)     AMBARI_PORT=${i#*=} ;;
                protocol=*) AMBARI_PROTOCOL=${i#*=} ;;
                user=*)     AMBARI_USER=${i#*=} ;;
                password=*) AMBARI_PASSWD=${i#*=} ;;
                cluster=*)  AMBARI_CLUSTER=${i#*=} ;;
                output=*)   AMBARI_OUTPUT=${i#*=} ;;
            esac
        done

        if [ -z "${AMBARI_SERVER}" ] ; then
            ocf_log err "required ambari property 'server' is unset"
            return 1
        fi

        if [ -z "${AMBARI_PORT}" ] ; then
            ocf_log err "required ambari property 'port' is unset"
            return 1
        fi

        if [ -z "${AMBARI_PROTOCOL}" ] ; then
            ocf_log err "required ambari property 'protocol' is unset"
            return 1
        fi

        if [ -z "${AMBARI_USER}" ] ; then
            ocf_log err "required ambari property 'user' is unset"
            return 1
        fi

        if [ -z "${AMBARI_PASSWD}" ] ; then
            ocf_log err "required ambari property 'password' is unset"
            return 1
        fi

        if [ -z "${AMBARI_CLUSTER}" ] ; then
            ocf_log err "required ambari property 'cluster' is unset"
            return 1
        fi

    else
        ocf_log info "No Ambari properties found."
        return 2
    fi
}
| |
| # validate the arguments to the service. |
| # this assumes that the defaults have been pushed in so only check for existence of the mandatory properties |
| # and that the numeric properties are valid |
# Validate the arguments to the service.
# Assumes fill_in_defaults has already run, so only the mandatory
# 'daemon' property and the numeric options are checked, followed by a
# check that the managed binary is installed.
validate_arguments_and_state() {
    if [ -z "${OCF_RESKEY_daemon}" ] ; then
        dump_environment
        ocf_log err "required property 'daemon' is unset"
        return ${OCF_ERR_ARGS}
    fi

    ocf_is_decimal "${OCF_RESKEY_boottime}" || {
        ocf_log err "Option 'boottime' is not numeric!"
        return ${OCF_ERR_CONFIGURED}
    }

    ocf_is_decimal "${OCF_RESKEY_probetime}" || {
        ocf_log err "Option 'probetime' is not numeric!"
        return ${OCF_ERR_CONFIGURED}
    }

    # Final check: propagate verify_binary's status as our own.
    verify_binary
}
| |
| # validate the arguments; exit with an error code |
| # if they are not |
# Validate the arguments; exit the script with the error code if they
# are not valid.
# NOTE(review): on VALID arguments this function historically returns 1
# (the final non-zero test), and the start action's dispatch relies on
# that quirk -- so the non-zero return is preserved deliberately.
assert_arguments_are_valid() {
    validate_arguments_and_state
    retval=$?
    if [ ${retval} -ne 0 ]; then
        exit ${retval}
    fi
    return 1
}
| |
| |
# ================================================================================
# This is the live code
# ================================================================================
# Entry point: fill in parameter defaults, then dispatch on the action.
fill_in_defaults

# then switch on the argument
case "$1" in
    start)
        # assert_arguments_are_valid exits the script on invalid
        # arguments; on valid arguments execution simply falls through.
        # (The old `[ $? -eq 0 ] && exit 0` that followed was dead code
        # which only worked because of the helper's quirky non-zero
        # return on success -- removed.)
        assert_arguments_are_valid
        execute_ambari_relocate_probe
        start
        exit $?
        ;;

    stop)
        if ! stop; then
            exit ${OCF_ERR_GENERIC}
        fi
        exit 0
        ;;

    status|monitor)
        # check the status of the live system; status_check exits itself
        status_check
        ;;

    meta-data)
        # generate the metadata
        metadata
        exit 0
        ;;

    recover|restart)
        execute_ambari_relocate_probe
        ocf_log info "Service restart"
        $0 stop || exit ${OCF_ERR_GENERIC}
        $0 start || exit ${OCF_ERR_GENERIC}
        exit 0
        ;;

    validate-all)
        validate_arguments_and_state
        exit $?
        ;;

    # this is a non-standard operation to work out what is going on
    diagnostics)
        echo PATH=${PATH}
        echo java is at $(which java)
        echo JAVA_HOME is ${JAVA_HOME}
        dump_environment
        exit 0
        ;;

    *)
        echo $"Usage: $0 {start|stop|status|monitor|restart|recover|validate-all|meta-data|diagnostics}"
        exit ${OCF_ERR_UNIMPLEMENTED}
        ;;

esac
| |