blob: f64ec8da039a1bff09bd77ee1bae27cfd1f26b0e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.slider.server.servicemonitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* This is the entry point to do work. A list of probes is taken in, in order of
* booting. Once live they go to the live probes list.
*
* The dependency probes are a set of probes for dependent services, all of which
* must be live before boot probes commence.
*
* The boot probes are executed and are allowed to fail; failure is interpreted as "not yet live"
*
* Once all boot probes are live, the live list is used for probes; these must not fail.
*
* There is no timeout on dependency probe bootstrap time, because of the notion that
* restarting this service will have no effect on the dependencies.
*/
public class ProbeWorker implements Runnable {
  protected static final Logger log = LoggerFactory.getLogger(ProbeWorker.class);
  public static final String FAILED_TO_BOOT = "Monitored service failed to bootstrap after ";
  public static final String FAILURE_OF_A_LIVE_PROBE_DURING_BOOTSTRAPPING = "Failure of a live probe during bootstrapping";

  /** Probes that are bootstrapped and then monitored while live. */
  private final List<Probe> monitorProbes;
  /** Probes of dependent services; never null (an empty list if none were supplied). */
  private final List<Probe> dependencyProbes;
  /** Sleep time between probe cycles, in milliseconds. */
  public final int interval;
  /** The most recent status passed through {@link #reportProbeStatus(ProbeStatus)}. */
  protected volatile ProbeStatus lastStatus;
  /** Status of the most recent failing ping made during the bootstrap phase. */
  protected volatile ProbeStatus lastFailingBootstrapProbe;
  /** The probe currently being pinged, or null when no ping is in progress. */
  protected volatile Probe currentProbe;
  /** Exit flag, set by {@link #setMustExit()} and polled by the worker loop. */
  private volatile boolean mustExit;
  /** Bootstrap timeout in milliseconds; a value <= 0 skips the bootstrap phase. */
  private final int bootstrapTimeout;
  /** Wall-clock time (ms) after which bootstrap is considered timed out; guarded by "this". */
  private long bootstrapEndtime;
  /** Optional listener for phase changes and probe results; may be null. */
  private ProbeReportHandler reportHandler;
  private volatile ProbePhase probePhase = ProbePhase.INIT;

  /**
   * Create a probe worker
   * @param monitorProbes list of probes that must boot and then go live -after which
   * they must stay live.
   * @param dependencyProbes the list of dependency probes that must all succeed before
   * any attempt to probe the direct probe list is performed. Once the
   * dependency phase has completed, these probes are never checked again.
   * A null value is treated as an empty list.
   * @param interval probe interval in milliseconds.
   * @param bootstrapTimeout timeout for bootstrap in milliseconds
   */
  public ProbeWorker(List<Probe> monitorProbes, List<Probe> dependencyProbes, int interval, int bootstrapTimeout) {
    this.monitorProbes = monitorProbes;
    this.dependencyProbes = dependencyProbes != null ? dependencyProbes : new ArrayList<Probe>(0);
    this.interval = interval;
    lastStatus = new ProbeStatus(now(),
        "Initial status");
    lastStatus.setProbePhase(ProbePhase.INIT);
    this.bootstrapTimeout = bootstrapTimeout;
  }

  /**
   * Initialize every probe: the monitor probes first, then the dependency probes.
   * @throws IOException if a probe fails to initialize
   */
  public void init() throws IOException {
    for (Probe probe : monitorProbes) {
      probe.init();
    }
    for (Probe probe : dependencyProbes) {
      probe.init();
    }
  }

  /**
   * Set the handler that receives phase changes, probe results and failures.
   * @param reportHandler the handler; null disables reporting
   */
  public void setReportHandler(ProbeReportHandler reportHandler) {
    this.reportHandler = reportHandler;
  }

  /**
   * Request that the worker loop exits. The flag is checked before each
   * ping and at the top of every loop iteration.
   */
  public void setMustExit() {
    this.mustExit = true;
  }

  /**
   * @return the last status that was reported (or the initial status)
   */
  public ProbeStatus getLastStatus() {
    return lastStatus;
  }

  /**
   * Get the probe that is being pinged right now.
   * This deliberately takes no lock: the field is volatile, and
   * {@link #enterProbePhase(ProbePhase)} holds this object's monitor while it calls
   * the report handler -a blocked handler must not block this diagnostic getter.
   * @return the probe in progress, or null if no ping is under way
   */
  public Probe getCurrentProbe() {
    return currentProbe;
  }

  /**
   * @return the current phase of the probe lifecycle
   */
  public ProbePhase getProbePhase() {
    return probePhase;
  }

  /**
   * Enter the new process state, and report it to the report handler.
   * This is synchronized just to make sure there isn't more than one
   * invocation at the same time.
   * @param status the new process status
   */
  private synchronized void enterProbePhase(ProbePhase status) {
    this.probePhase = status;
    ProbeReportHandler handler = reportHandler;
    if (handler != null) {
      handler.probeProcessStateChange(status);
    }
  }

  /**
   * Report the probe status to the listener -setting the probe phase field
   * before doing so.
   * The value is also stored in the {@link #lastStatus} field, whether or not
   * a report handler has been set.
   * @param status the new status
   */
  private void reportProbeStatus(ProbeStatus status) {
    ProbePhase phase = getProbePhase();
    status.setProbePhase(phase);
    lastStatus = status;
    //the handler is optional, as it is in enterProbePhase()
    ProbeReportHandler handler = reportHandler;
    if (handler != null) {
      handler.probeResult(phase, status);
    }
  }

  /**
   * Ping one probe. Logs the operation at debug level; sets the field <code>currentProbe</code>
   * to the probe for the duration of the operation -this is used when identifying the
   * cause of a hung reporting loop
   * @param probe probe to ping
   * @param live flag to indicate whether or not the operation is live or bootstrapping
   * @return the status of the ping
   * @throws ProbeInterruptedException if the probe has been told to exit
   */
  private ProbeStatus ping(Probe probe, boolean live) throws ProbeInterruptedException {
    log.debug("Executing {}", probe);
    checkForExitRequest();
    currentProbe = probe;
    try {
      return probe.ping(live);
    } finally {
      //always clear the marker, even if the ping threw
      currentProbe = null;
    }
  }

  /**
   * Check for an exit request -and convert it to an exception if made
   * @throws ProbeInterruptedException iff {@link #mustExit} is true
   */
  private void checkForExitRequest() throws ProbeInterruptedException {
    if (mustExit) {
      throw new ProbeInterruptedException();
    }
  }

  /**
   * Check the dependencies.
   * The moment a failing test is reached the call returns without
   * any reporting.
   *
   * All successful probes are reported, so as to keep the heartbeats happy.
   *
   * @return the status of the last dependency check. If this is a success
   * then every probe passed. Null if there are no dependency probes.
   * @throws ProbeInterruptedException if an exit was requested
   */
  private ProbeStatus checkDependencyProbes() throws ProbeInterruptedException {
    ProbeStatus status = null;
    for (Probe dependency : dependencyProbes) {
      //ping them, making clear they are not to run any bootstrap logic
      status = ping(dependency, true);
      if (!status.isSuccess()) {
        //the first failure means the rest of the list can be skipped
        break;
      }
      reportProbeStatus(status);
    }
    //return the last status
    return status;
  }

  /**
   * Run through the dependency probes, stopping at the first failure, and report
   * their outcomes. A failing dependency is reported with its success flag set,
   * as a dependency that is not yet up is not a failure of the monitored service.
   * @return true iff all the probes have succeeded.
   * @throws ProbeInterruptedException if the process was interrupted.
   */
  public boolean checkAndReportDependencyProbes() throws ProbeInterruptedException {
    ProbeStatus status = checkDependencyProbes();
    if (status != null && !status.isSuccess()) {
      //during dependency checking, a failure is still reported as a success
      status.markAsSuccessful();
      reportProbeStatus(status);
      //then return without checking anything else
      return false;
    }
    //all dependencies are done.
    return true;
  }

  /**
   * Begin bootstrapping by telling each probe that they have started.
   * This sets the timeouts up, as well as permits any other set-up actions
   * to begin.
   */
  private void beginBootstrapProbes() {
    //same lock as isBootstrapTimeExceeded(), which reads the field
    synchronized (this) {
      bootstrapEndtime = now() + bootstrapTimeout;
    }
    for (Probe probe : monitorProbes) {
      probe.beginBootstrap();
    }
  }

  /**
   * @return the current wall-clock time in milliseconds
   */
  private long now() {
    return System.currentTimeMillis();
  }

  /**
   * Check the bootstrap probe list. Every probe is pinged and reported.
   * A failure of a probe that has not yet booted is recorded in
   * {@link #lastFailingBootstrapProbe} and reported with its success flag set;
   * a failure of a probe that has already booted is reported unedited and
   * raised as a {@link ProbeFailedException}.
   * If the bootstrap period has already timed out on entry, that is also
   * turned into a {@link ProbeFailedException}.
   * @return true iff every probe succeeded in this pass
   * @throws ProbeInterruptedException interrupts
   * @throws ProbeFailedException on a boot timeout or the failure of a booted probe
   */
  private boolean checkBootstrapProbes() throws ProbeInterruptedException, ProbeFailedException {
    verifyBootstrapHasNotTimedOut();
    boolean probeFailed = false;
    //now run through all the bootstrap probes
    for (Probe probe : monitorProbes) {
      //ping them
      ProbeStatus status = ping(probe, false);
      if (!status.isSuccess()) {
        probeFailed = true;
        //remember it: this is the status used if the bootstrap later times out
        lastFailingBootstrapProbe = status;
        probe.failureCount++;
        log.debug("Booting probe failed: {}", status);
        //this is a failure but not a timeout
        //during boot, a failure of a probe that hasn't booted is still reported as a success
        if (!probe.isBooted()) {
          //so the success bit is flipped
          status.markAsSuccessful();
          reportProbeStatus(status);
        } else {
          //the probe had booted but then it switched to failing
          //update the status unedited
          reportProbeStatus(status);
          //then fail
          throw raiseProbeFailure(status, FAILURE_OF_A_LIVE_PROBE_DURING_BOOTSTRAPPING);
        }
      } else {
        //this probe is working
        if (!probe.isBooted()) {
          //if it is new, mark it as live
          log.debug("Booting probe is now live: {}", probe);
          probe.endBootstrap();
          //tell the report handler (if there is one) that another probe has booted
          ProbeReportHandler handler = reportHandler;
          if (handler != null) {
            handler.probeBooted(status);
          }
        }
        //push out its status
        reportProbeStatus(status);
        probe.successCount++;
      }
    }
    return !probeFailed;
  }

  /**
   * @return the bootstrap timeout in milliseconds
   */
  public int getBootstrapTimeout() {
    return bootstrapTimeout;
  }

  /**
   * This checks that bootstrap operations have not timed out.
   * On a timeout the exception carries the status of the last failing bootstrap
   * probe, with its success flag cleared (it may have been set for reporting);
   * if no bootstrap probe has failed, a new failure status is created.
   * @throws ProbeFailedException if the bootstrap has failed
   */
  public void verifyBootstrapHasNotTimedOut() throws ProbeFailedException {
    //first step -look for a timeout
    if (isBootstrapTimeExceeded()) {
      String text = FAILED_TO_BOOT
          + MonitorUtils.millisToHumanTime(bootstrapTimeout);
      ProbeStatus status;
      //read the volatile field once so the null check and the use agree
      ProbeStatus lastFailure = lastFailingBootstrapProbe;
      if (lastFailure != null) {
        status = lastFailure;
        status.setSuccess(false);
      } else {
        status = new ProbeStatus();
        status.finish(null, false, text, null);
      }
      throw raiseProbeFailure(status,
          text);
    }
  }

  /**
   * predicate that gets current time and checks for its time being exceeded.
   * Until the bootstrap phase has begun the end time still has its initial
   * value of zero, so this returns true.
   * @return true iff the current time is > the end time
   */
  public synchronized boolean isBootstrapTimeExceeded() {
    return now() > bootstrapEndtime;
  }

  /**
   * run through all the bootstrap probes and see if they are live.
   * If the bootstrap timeout is zero or negative, no probe is pinged and
   * the phase is declared complete.
   * @return true iff all boot probes succeeded
   * @throws ProbeInterruptedException the probe interruption flags
   * @throws ProbeFailedException if a probe failed.
   */
  public boolean checkAndReportBootstrapProbes() throws ProbeInterruptedException,
      ProbeFailedException {
    if (bootstrapTimeout <= 0) {
      //there is no period of grace for bootstrapping probes, so return true saying
      //this phase is complete
      return true;
    }
    //now the bootstrapping probes
    return checkBootstrapProbes();
  }

  /**
   * run through all the live probes, pinging and reporting them.
   * A single probe failure is turned into an exception
   * @throws ProbeFailedException a probe failed
   * @throws ProbeInterruptedException the probe process was explicitly interrupted
   */
  protected void checkAndReportLiveProbes() throws ProbeFailedException, ProbeInterruptedException {
    //go through the live list
    log.debug("Checking live probes");
    for (Probe probe : monitorProbes) {
      ProbeStatus status = ping(probe, true);
      //report first, so that the failing status is seen before the exception is raised
      reportProbeStatus(status);
      if (!status.isSuccess()) {
        throw raiseProbeFailure(status, "Failure of probe in \"live\" monitor");
      }
      probe.successCount++;
    }
    //here all is well, so notify the reporter (if there is one)
    ProbeReportHandler handler = reportHandler;
    if (handler != null) {
      handler.liveProbeCycleCompleted();
    }
  }

  /**
   * Run the set of probes relevant for this phase of the probe lifecycle.
   * @throws ProbeFailedException a probe failed
   * @throws ProbeInterruptedException the probe process was explicitly interrupted
   */
  protected void executeProbePhases() throws ProbeFailedException, ProbeInterruptedException {
    switch (probePhase) {
      case INIT:
        enterProbePhase(ProbePhase.DEPENDENCY_CHECKING);
        //fall through straight into the dependency check
      case DEPENDENCY_CHECKING:
        if (checkAndReportDependencyProbes()) {
          enterProbePhase(ProbePhase.BOOTSTRAPPING);
          beginBootstrapProbes();
        }
        break;
      case BOOTSTRAPPING:
        if (checkAndReportBootstrapProbes()) {
          enterProbePhase(ProbePhase.LIVE);
        }
        break;
      case LIVE:
        checkAndReportLiveProbes();
        break;
      case TERMINATING:
      default:
        //do nothing.
        break;
    }
  }

  /**
   * Raise a probe failure; injecting the phase into the status result first
   *
   * @param status ping result
   * @param text optional text -null or "" means "none"
   * @return an exception ready to throw
   */
  private ProbeFailedException raiseProbeFailure(ProbeStatus status, String text) {
    status.setProbePhase(probePhase);
    log.info("Probe failed: {}", status);
    return new ProbeFailedException(text, status);
  }

  /**
   * The worker loop: enter the dependency checking phase, then repeatedly sleep for
   * {@link #interval} milliseconds and execute the probes of the current phase.
   * Probe failures are relayed to {@link #probeFailed(ProbeFailedException)} and the
   * loop continues; an exit request or a thread interrupt ends the loop, after which
   * the terminating phase is entered.
   */
  @Override
  public void run() {
    int size = monitorProbes.size();
    log.info("Probe Worker Starting; {} probe{}:", size, MonitorUtils.toPlural(size));
    enterProbePhase(ProbePhase.DEPENDENCY_CHECKING);
    for (Probe probe : monitorProbes) {
      log.info(probe.getName());
    }
    while (!mustExit) {
      try {
        Thread.sleep(interval);
        executeProbePhases();
      } catch (ProbeFailedException e) {
        //relay to the inner loop handler
        probeFailed(e);
      } catch (InterruptedException interrupted) {
        //restore the interrupt status so the owner of this thread can see it
        Thread.currentThread().interrupt();
        break;
      } catch (ProbeInterruptedException e) {
        //exit raised.
        //this will be true, just making extra-sure
        break;
      }
    }
    log.info("Probe Worker Exiting");
    enterProbePhase(ProbePhase.TERMINATING);
  }

  /**
   * Relay a probe failure to the report handler. If no handler has been
   * set the failure is logged instead.
   * @param e the failure
   */
  protected void probeFailed(ProbeFailedException e) {
    ProbeReportHandler handler = reportHandler;
    if (handler != null) {
      handler.probeFailure(e);
    } else {
      log.warn("Probe failure with no report handler set to receive it", e);
    }
  }
}