| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hadoop.mapred; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Timer; |
| import java.util.TimerTask; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.mapred.TaskTrackerStatus.TaskTrackerHealthStatus; |
| import org.apache.hadoop.mapreduce.server.tasktracker.TTConfig; |
| import org.apache.hadoop.util.StringUtils; |
| import org.apache.hadoop.util.Shell.ExitCodeException; |
| import org.apache.hadoop.util.Shell.ShellCommandExecutor; |
| |
| /** |
| * |
| * The class which provides functionality of checking the health of the node and |
| * reporting back to the service for which the health checker has been asked to |
| * report. |
| */ |
| class NodeHealthCheckerService { |
| |
| private static Log LOG = LogFactory.getLog(NodeHealthCheckerService.class); |
| |
| /** Absolute path to the health script. */ |
| private String nodeHealthScript; |
| /** Delay after which node health script to be executed */ |
| private long intervalTime; |
| /** Time after which the script should be timedout */ |
| private long scriptTimeout; |
| /** Timer used to schedule node health monitoring script execution */ |
| private Timer nodeHealthScriptScheduler; |
| |
| /** ShellCommandExecutor used to execute monitoring script */ |
| ShellCommandExecutor shexec = null; |
| |
| /** Configuration used by the checker */ |
| private Configuration conf; |
| |
| /** Pattern used for searching in the output of the node health script */ |
| static private final String ERROR_PATTERN = "ERROR"; |
| |
| /* Configuration keys */ |
| static final String HEALTH_CHECK_SCRIPT_PROPERTY = |
| TTConfig.TT_HEALTH_CHECKER_SCRIPT_PATH; |
| |
| static final String HEALTH_CHECK_INTERVAL_PROPERTY = |
| TTConfig.TT_HEALTH_CHECKER_INTERVAL; |
| |
| static final String HEALTH_CHECK_FAILURE_INTERVAL_PROPERTY = |
| TTConfig.TT_HEALTH_CHECKER_SCRIPT_TIMEOUT; |
| |
| static final String HEALTH_CHECK_SCRIPT_ARGUMENTS_PROPERTY = |
| TTConfig.TT_HEALTH_CHECKER_SCRIPT_ARGS; |
| |
| /* end of configuration keys */ |
| /** Time out error message */ |
| static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG = "Node health script timed out"; |
| |
| /** Default frequency of running node health script */ |
| private static final long DEFAULT_HEALTH_CHECK_INTERVAL = 10 * 60 * 1000; |
| /** Default script time out period */ |
| private static final long DEFAULT_HEALTH_SCRIPT_FAILURE_INTERVAL = 2 * DEFAULT_HEALTH_CHECK_INTERVAL; |
| |
| private boolean isHealthy; |
| |
| private String healthReport; |
| |
| private long lastReportedTime; |
| |
| private TimerTask timer; |
| |
| |
| private enum HealthCheckerExitStatus { |
| SUCCESS, |
| TIMED_OUT, |
| FAILED_WITH_EXIT_CODE, |
| FAILED_WITH_EXCEPTION, |
| FAILED |
| } |
| |
| |
| /** |
| * Class which is used by the {@link Timer} class to periodically execute the |
| * node health script. |
| * |
| */ |
| private class NodeHealthMonitorExecutor extends TimerTask { |
| |
| String exceptionStackTrace = ""; |
| |
| public NodeHealthMonitorExecutor(String[] args) { |
| ArrayList<String> execScript = new ArrayList<String>(); |
| execScript.add(nodeHealthScript); |
| if (args != null) { |
| execScript.addAll(Arrays.asList(args)); |
| } |
| shexec = new ShellCommandExecutor(execScript |
| .toArray(new String[execScript.size()]), null, null, scriptTimeout); |
| } |
| |
| @Override |
| public void run() { |
| HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS; |
| try { |
| shexec.execute(); |
| } catch (ExitCodeException e) { |
| // ignore the exit code of the script |
| status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE; |
| } catch (Exception e) { |
| LOG.warn("Caught exception : " + e.getMessage()); |
| if (!shexec.isTimedOut()) { |
| status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION; |
| } else { |
| status = HealthCheckerExitStatus.TIMED_OUT; |
| } |
| exceptionStackTrace = StringUtils.stringifyException(e); |
| } finally { |
| if (status == HealthCheckerExitStatus.SUCCESS) { |
| if (hasErrors(shexec.getOutput())) { |
| status = HealthCheckerExitStatus.FAILED; |
| } |
| } |
| reportHealthStatus(status); |
| } |
| } |
| |
| /** |
| * Method which is used to parse output from the node health monitor and |
| * send to the report address. |
| * |
| * The timed out script or script which causes IOException output is |
| * ignored. |
| * |
| * The node is marked unhealthy if |
| * <ol> |
| * <li>The node health script times out</li> |
| * <li>The node health scripts output has a line which begins with ERROR</li> |
| * <li>An exception is thrown while executing the script</li> |
| * </ol> |
| * If the script throws {@link IOException} or {@link ExitCodeException} the |
| * output is ignored and node is left remaining healthy, as script might |
| * have syntax error. |
| * |
| * @param status |
| */ |
| void reportHealthStatus(HealthCheckerExitStatus status) { |
| long now = System.currentTimeMillis(); |
| switch (status) { |
| case SUCCESS: |
| setHealthStatus(true, "", now); |
| break; |
| case TIMED_OUT: |
| setHealthStatus(false, NODE_HEALTH_SCRIPT_TIMED_OUT_MSG); |
| break; |
| case FAILED_WITH_EXCEPTION: |
| setHealthStatus(false, exceptionStackTrace); |
| break; |
| case FAILED_WITH_EXIT_CODE: |
| setHealthStatus(true, "", now); |
| break; |
| case FAILED: |
| setHealthStatus(false, shexec.getOutput()); |
| break; |
| } |
| } |
| |
| /** |
| * Method to check if the output string has line which begins with ERROR. |
| * |
| * @param output |
| * string |
| * @return true if output string has error pattern in it. |
| */ |
| private boolean hasErrors(String output) { |
| String[] splits = output.split("\n"); |
| for (String split : splits) { |
| if (split.startsWith(ERROR_PATTERN)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| } |
| |
| public NodeHealthCheckerService(Configuration conf) { |
| this.conf = conf; |
| this.lastReportedTime = System.currentTimeMillis(); |
| this.isHealthy = true; |
| this.healthReport = ""; |
| initialize(conf); |
| } |
| |
| /* |
| * Method which initializes the values for the script path and interval time. |
| */ |
| private void initialize(Configuration conf) { |
| this.nodeHealthScript = |
| conf.get(TTConfig.TT_HEALTH_CHECKER_SCRIPT_PATH); |
| this.intervalTime = conf.getLong(TTConfig.TT_HEALTH_CHECKER_INTERVAL, |
| DEFAULT_HEALTH_CHECK_INTERVAL); |
| this.scriptTimeout = conf.getLong( |
| TTConfig.TT_HEALTH_CHECKER_SCRIPT_TIMEOUT, |
| DEFAULT_HEALTH_SCRIPT_FAILURE_INTERVAL); |
| String[] args = conf.getStrings(TTConfig.TT_HEALTH_CHECKER_SCRIPT_ARGS, |
| new String[] {}); |
| timer = new NodeHealthMonitorExecutor(args); |
| } |
| |
| /** |
| * Method used to start the Node health monitoring. |
| * |
| */ |
| void start() { |
| // if health script path is not configured don't start the thread. |
| if (!shouldRun(conf)) { |
| LOG.info("Not starting node health monitor"); |
| return; |
| } |
| nodeHealthScriptScheduler = new Timer("NodeHealthMonitor-Timer", true); |
| // Start the timer task immediately and |
| // then periodically at interval time. |
| nodeHealthScriptScheduler.scheduleAtFixedRate(timer, 0, intervalTime); |
| } |
| |
| /** |
| * Method used to terminate the node health monitoring service. |
| * |
| */ |
| void stop() { |
| if (!shouldRun(conf)) { |
| return; |
| } |
| nodeHealthScriptScheduler.cancel(); |
| if (shexec != null) { |
| Process p = shexec.getProcess(); |
| if (p != null) { |
| p.destroy(); |
| } |
| } |
| } |
| |
| /** |
| * Gets the if the node is healthy or not |
| * |
| * @return true if node is healthy |
| */ |
| private boolean isHealthy() { |
| return isHealthy; |
| } |
| |
| /** |
| * Sets if the node is healhty or not. |
| * |
| * @param isHealthy |
| * if or not node is healthy |
| */ |
| private synchronized void setHealthy(boolean isHealthy) { |
| this.isHealthy = isHealthy; |
| } |
| |
| /** |
| * Returns output from health script. if node is healthy then an empty string |
| * is returned. |
| * |
| * @return output from health script |
| */ |
| private String getHealthReport() { |
| return healthReport; |
| } |
| |
| /** |
| * Sets the health report from the node health script. |
| * |
| * @param healthReport |
| */ |
| private synchronized void setHealthReport(String healthReport) { |
| this.healthReport = healthReport; |
| } |
| |
| /** |
| * Returns time stamp when node health script was last run. |
| * |
| * @return timestamp when node health script was last run |
| */ |
| private long getLastReportedTime() { |
| return lastReportedTime; |
| } |
| |
| /** |
| * Sets the last run time of the node health script. |
| * |
| * @param lastReportedTime |
| */ |
| private synchronized void setLastReportedTime(long lastReportedTime) { |
| this.lastReportedTime = lastReportedTime; |
| } |
| |
| /** |
| * Method used to determine if or not node health monitoring service should be |
| * started or not. Returns true if following conditions are met: |
| * |
| * <ol> |
| * <li>Path to Node health check script is not empty</li> |
| * <li>Node health check script file exists</li> |
| * </ol> |
| * |
| * @param conf |
| * @return true if node health monitoring service can be started. |
| */ |
| static boolean shouldRun(Configuration conf) { |
| String nodeHealthScript = |
| conf.get(TTConfig.TT_HEALTH_CHECKER_SCRIPT_PATH); |
| if (nodeHealthScript == null || nodeHealthScript.trim().isEmpty()) { |
| return false; |
| } |
| File f = new File(nodeHealthScript); |
| return f.exists() && f.canExecute(); |
| } |
| |
| private synchronized void setHealthStatus(boolean isHealthy, String output) { |
| this.setHealthy(isHealthy); |
| this.setHealthReport(output); |
| } |
| |
| private synchronized void setHealthStatus(boolean isHealthy, String output, |
| long time) { |
| this.setHealthStatus(isHealthy, output); |
| this.setLastReportedTime(time); |
| } |
| |
| /** |
| * Method to populate the fields for the {@link TaskTrackerHealthStatus} |
| * |
| * @param healthStatus |
| */ |
| synchronized void setHealthStatus(TaskTrackerHealthStatus healthStatus) { |
| healthStatus.setNodeHealthy(this.isHealthy()); |
| healthStatus.setHealthReport(this.getHealthReport()); |
| healthStatus.setLastReported(this.getLastReportedTime()); |
| } |
| |
| /** |
| * Test method to directly access the timer which node |
| * health checker would use. |
| * |
| * |
| * @return Timer task |
| */ |
| //XXX:Not to be used directly. |
| TimerTask getTimer() { |
| return timer; |
| } |
| } |