blob: 5f3531d2e092609aee7fab1ec042dba1515d9a98 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
import org.apache.hadoop.util.StringUtils;
/**
 * Chore that runs the configured health check at a fixed interval and asks its
 * {@link Stoppable} to stop once the node has been reported unhealthy {@code threshold} times
 * within a single failure window.
 * <p>
 * The failure window is {@code threshold * sleepTime} long and opens on the first unhealthy
 * report. A healthy report does not reset the failure counter; the counter restarts at 1 when
 * an unhealthy report arrives after the current window has elapsed.
 */
public class HealthCheckChore extends ScheduledChore {
  private static final Log LOG = LogFactory.getLog(HealthCheckChore.class);

  private final HealthChecker healthChecker;
  private final Configuration config;
  /** Number of unhealthy reports inside one failure window that triggers a stop request. */
  private final int threshold;
  /** Length of the failure window; compared against {@code currentTimeMillis()} deltas. */
  private final long failureWindow;
  /** Unhealthy reports counted in the current failure window; 0 until the first failure. */
  private int numTimesUnhealthy = 0;
  /** Wall-clock time (ms) at which the current failure window was opened. */
  private long startWindow;

  /**
   * Creates the chore and initializes the underlying {@link HealthChecker} from configuration.
   *
   * @param sleepTime period between two health checks; also used to size the failure window
   * @param stopper the component that is asked to stop when the failure threshold is reached
   * @param conf configuration providing the script location, script timeout and failure
   *          threshold
   */
  public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) {
    super("HealthChecker", stopper, sleepTime);
    LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime));
    this.config = conf;
    String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC);
    long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT,
      HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT);
    this.healthChecker = new HealthChecker();
    this.healthChecker.init(healthCheckScript, scriptTimeout);
    this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD,
      HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD);
    // Widen before multiplying so that large thresholds/periods cannot overflow int.
    this.failureWindow = (long) this.threshold * (long) sleepTime;
  }

  /**
   * Runs one health check. Healthy results are ignored; an unhealthy result is counted, may
   * trigger a stop request, and is always logged.
   */
  @Override
  protected void chore() {
    HealthReport report = healthChecker.checkHealth();
    if (report.getStatus() == HealthCheckerExitStatus.SUCCESS) {
      return;
    }
    if (decideToStop()) {
      this.getStopper().stop("The node reported unhealthy " + threshold
        + " number of times consecutively.");
    }
    // Log every unhealthy report, whether or not a stop was requested.
    // StringUtils.formatTime() renders a duration, so a Date is used for the wall-clock time.
    LOG.info("Health status at " + new java.util.Date() + " : "
      + report.getHealthReport());
  }

  /**
   * Records one unhealthy report and decides whether the stopper should be invoked.
   * <p>
   * The first failure, or a failure arriving after the current window has elapsed, opens a new
   * window with a count of 1; any other failure increments the count. The threshold is checked
   * after every update, so a threshold of 1 stops on the first failure. A non-positive
   * threshold can never be matched and therefore never requests a stop.
   *
   * @return {@code true} when the failure count in the current window equals the threshold
   */
  private boolean decideToStop() {
    long now = System.currentTimeMillis();
    boolean windowExpired = (now - startWindow) >= failureWindow;
    if (numTimesUnhealthy == 0 || windowExpired) {
      // First failure ever, or the previous window has lapsed: start counting afresh.
      numTimesUnhealthy = 1;
      startWindow = now;
    } else {
      numTimesUnhealthy++;
    }
    return numTimesUnhealthy == threshold;
  }
}