blob: 0279a6038de23a7e9d7a7b69860db1d1e3e9fd50 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.procedure;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.errorhandling.ForeignException;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionListener;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
import org.apache.hadoop.hbase.errorhandling.TimeoutExceptionInjector;
import com.google.common.collect.Lists;
/**
* A globally-barriered distributed procedure. This class encapsulates state and methods for
* tracking and managing a distributed procedure, as well as aborting if any member encounters
* a problem or if a cancellation is requested.
* <p>
* All procedures first attempt to reach a barrier point with the {@link #sendGlobalBarrierStart()}
* method. The procedure contacts all members and waits for all subprocedures to execute
* {@link Subprocedure#acquireBarrier} to acquire its local piece of the global barrier and then
* send acquisition info back to the coordinator. If all acquisitions at subprocedures succeed,
* the coordinator then will call {@link #sendGlobalBarrierReached()}. This notifies members to
* execute the {@link Subprocedure#insideBarrier()} method. The procedure is blocked until all
* {@link Subprocedure#insideBarrier} executions complete at the members. When
* {@link Subprocedure#insideBarrier} completes at each member, the member sends notification to
* the coordinator. Once all members complete, the coordinator calls
* {@link #sendGlobalBarrierComplete()}.
* <p>
* If errors are encountered remotely, they are forwarded to the coordinator, and
* {@link Subprocedure#cleanup(Exception)} is called.
* <p>
* Each Procedure and each Subprocedure enforces a time limit on the execution time. If the time
* limit expires before the procedure completes the {@link TimeoutExceptionInjector} will trigger
* an {@link ForeignException} to abort the procedure. This is particularly useful for situations
* when running a distributed {@link Subprocedure} so participants can avoid blocking for extreme
* amounts of time if one of the participants fails or takes a really long time (e.g. GC pause).
* <p>
* Users should generally not directly create or subclass instances of this. They are created
* for them implicitly via {@link ProcedureCoordinator#startProcedure(ForeignExceptionDispatcher,
* String, byte[], List)}}
*/
@InterfaceAudience.Private
public class Procedure implements Callable<Void>, ForeignExceptionListener {
private static final Log LOG = LogFactory.getLog(Procedure.class);
//
// Arguments and naming
//
// Name of the procedure
final private String procName;
// Arguments for this procedure execution
final private byte[] args;
//
// Execution State
//
/** latch for waiting until all members have acquire in barrier state */
final CountDownLatch acquiredBarrierLatch;
/** latch for waiting until all members have executed and released their in barrier state */
final CountDownLatch releasedBarrierLatch;
/** latch for waiting until a procedure has completed */
final CountDownLatch completedLatch;
/** monitor to check for errors */
private final ForeignExceptionDispatcher monitor;
//
// Execution Timeout Handling.
//
/** frequency to check for errors (ms) */
protected final long wakeFrequency;
protected final TimeoutExceptionInjector timeoutInjector;
//
// Members' and Coordinator's state
//
/** lock to prevent nodes from acquiring and then releasing before we can track them */
private Object joinBarrierLock = new Object();
private final List<String> acquiringMembers;
private final List<String> inBarrierMembers;
private final HashMap<String, byte[]> dataFromFinishedMembers;
private ProcedureCoordinator coord;
/**
* Creates a procedure. (FOR TESTING)
*
* {@link Procedure} state to be run by a {@link ProcedureCoordinator}.
* @param coord coordinator to call back to for general errors (e.g.
* {@link ProcedureCoordinator#rpcConnectionFailure(String, IOException)}).
* @param monitor error monitor to check for external errors
* @param wakeFreq frequency to check for errors while waiting
* @param timeout amount of time to allow the procedure to run before cancelling
* @param procName name of the procedure instance
* @param args argument data associated with the procedure instance
* @param expectedMembers names of the expected members
*/
public Procedure(ProcedureCoordinator coord, ForeignExceptionDispatcher monitor, long wakeFreq,
long timeout, String procName, byte[] args, List<String> expectedMembers) {
this.coord = coord;
this.acquiringMembers = new ArrayList<String>(expectedMembers);
this.inBarrierMembers = new ArrayList<String>(acquiringMembers.size());
this.dataFromFinishedMembers = new HashMap<String, byte[]>();
this.procName = procName;
this.args = args;
this.monitor = monitor;
this.wakeFrequency = wakeFreq;
int count = expectedMembers.size();
this.acquiredBarrierLatch = new CountDownLatch(count);
this.releasedBarrierLatch = new CountDownLatch(count);
this.completedLatch = new CountDownLatch(1);
this.timeoutInjector = new TimeoutExceptionInjector(monitor, timeout);
}
/**
* Create a procedure.
*
* Users should generally not directly create instances of this. They are created them
* implicitly via {@link ProcedureCoordinator#createProcedure(ForeignExceptionDispatcher,
* String, byte[], List)}}
*
* @param coord coordinator to call back to for general errors (e.g.
* {@link ProcedureCoordinator#rpcConnectionFailure(String, IOException)}).
* @param wakeFreq frequency to check for errors while waiting
* @param timeout amount of time to allow the procedure to run before cancelling
* @param procName name of the procedure instance
* @param args argument data associated with the procedure instance
* @param expectedMembers names of the expected members
*/
public Procedure(ProcedureCoordinator coord, long wakeFreq, long timeout,
String procName, byte[] args, List<String> expectedMembers) {
this(coord, new ForeignExceptionDispatcher(), wakeFreq, timeout, procName, args,
expectedMembers);
}
public String getName() {
return procName;
}
/**
* @return String of the procedure members both trying to enter the barrier and already in barrier
*/
public String getStatus() {
String waiting, done;
synchronized (joinBarrierLock) {
waiting = acquiringMembers.toString();
done = inBarrierMembers.toString();
}
return "Procedure " + procName + " { waiting=" + waiting + " done="+ done + " }";
}
/**
* Get the ForeignExceptionDispatcher
* @return the Procedure's monitor.
*/
public ForeignExceptionDispatcher getErrorMonitor() {
return monitor;
}
/**
* This call is the main execution thread of the barriered procedure. It sends messages and
* essentially blocks until all procedure members acquire or later complete but periodically
* checks for foreign exceptions.
*/
@Override
@SuppressWarnings("finally")
final public Void call() {
LOG.info("Starting procedure '" + procName + "'");
// start the timer
timeoutInjector.start();
// run the procedure
try {
// start by checking for error first
monitor.rethrowException();
LOG.debug("Procedure '" + procName + "' starting 'acquire'");
sendGlobalBarrierStart();
// wait for all the members to report acquisition
LOG.debug("Waiting for all members to 'acquire'");
waitForLatch(acquiredBarrierLatch, monitor, wakeFrequency, "acquired");
monitor.rethrowException();
LOG.debug("Procedure '" + procName + "' starting 'in-barrier' execution.");
sendGlobalBarrierReached();
// wait for all members to report barrier release
LOG.debug("Waiting for all members to 'release'");
waitForLatch(releasedBarrierLatch, monitor, wakeFrequency, "released");
// make sure we didn't get an error during in barrier execution and release
monitor.rethrowException();
LOG.info("Procedure '" + procName + "' execution completed");
} catch (Exception e) {
if (e instanceof InterruptedException) {
Thread.currentThread().interrupt();
}
String msg = "Procedure '" + procName +"' execution failed!";
LOG.error(msg, e);
receive(new ForeignException(getName(), e));
} finally {
LOG.debug("Running finish phase.");
sendGlobalBarrierComplete();
completedLatch.countDown();
// tell the timer we are done, if we get here successfully
timeoutInjector.complete();
return null;
}
}
/**
* Sends a message to Members to create a new {@link Subprocedure} for this Procedure and execute
* the {@link Subprocedure#acquireBarrier} step.
* @throws ForeignException
*/
public void sendGlobalBarrierStart() throws ForeignException {
// start the procedure
LOG.debug("Starting procedure '" + procName + "', kicking off acquire phase on members.");
try {
// send procedure barrier start to specified list of members. cloning the list to avoid
// concurrent modification from the controller setting the prepared nodes
coord.getRpcs().sendGlobalBarrierAcquire(this, args, Lists.newArrayList(this.acquiringMembers));
} catch (IOException e) {
coord.rpcConnectionFailure("Can't reach controller.", e);
} catch (IllegalArgumentException e) {
throw new ForeignException(getName(), e);
}
}
/**
* Sends a message to all members that the global barrier condition has been satisfied. This
* should only be executed after all members have completed its
* {@link Subprocedure#acquireBarrier()} call successfully. This triggers the member
* {@link Subprocedure#insideBarrier} method.
* @throws ForeignException
*/
public void sendGlobalBarrierReached() throws ForeignException {
try {
// trigger to have member run {@link Subprocedure#insideBarrier}
coord.getRpcs().sendGlobalBarrierReached(this, Lists.newArrayList(inBarrierMembers));
} catch (IOException e) {
coord.rpcConnectionFailure("Can't reach controller.", e);
}
}
/**
* Sends a message to members that all {@link Subprocedure#insideBarrier} calls have completed.
* After this executes, the coordinator can assume that any state resources about this barrier
* procedure state has been released.
*/
public void sendGlobalBarrierComplete() {
LOG.debug("Finished coordinator procedure - removing self from list of running procedures");
try {
coord.getRpcs().resetMembers(this);
} catch (IOException e) {
coord.rpcConnectionFailure("Failed to reset procedure:" + procName, e);
}
}
//
// Call backs from other external processes.
//
/**
* Call back triggered by an individual member upon successful local barrier acquisition
* @param member
*/
public void barrierAcquiredByMember(String member) {
LOG.debug("member: '" + member + "' joining acquired barrier for procedure '" + procName
+ "' on coordinator");
if (this.acquiringMembers.contains(member)) {
synchronized (joinBarrierLock) {
if (this.acquiringMembers.remove(member)) {
this.inBarrierMembers.add(member);
acquiredBarrierLatch.countDown();
}
}
LOG.debug("Waiting on: " + acquiredBarrierLatch + " remaining members to acquire global barrier");
} else {
LOG.warn("Member " + member + " joined barrier, but we weren't waiting on it to join." +
" Continuing on.");
}
}
/**
* Call back triggered by a individual member upon successful local in-barrier execution and
* release
* @param member
* @param dataFromMember
*/
public void barrierReleasedByMember(String member, byte[] dataFromMember) {
boolean removed = false;
synchronized (joinBarrierLock) {
removed = this.inBarrierMembers.remove(member);
if (removed) {
releasedBarrierLatch.countDown();
}
}
if (removed) {
LOG.debug("Member: '" + member + "' released barrier for procedure'" + procName
+ "', counting down latch. Waiting for " + releasedBarrierLatch.getCount()
+ " more");
} else {
LOG.warn("Member: '" + member + "' released barrier for procedure'" + procName
+ "', but we weren't waiting on it to release!");
}
dataFromFinishedMembers.put(member, dataFromMember);
}
/**
* Waits until the entire procedure has globally completed, or has been aborted. If an
* exception is thrown the procedure may or not have run cleanup to trigger the completion latch
* yet.
* @throws ForeignException
* @throws InterruptedException
*/
public void waitForCompleted() throws ForeignException, InterruptedException {
waitForLatch(completedLatch, monitor, wakeFrequency, procName + " completed");
}
/**
* Waits until the entire procedure has globally completed, or has been aborted. If an
* exception is thrown the procedure may or not have run cleanup to trigger the completion latch
* yet.
* @return data returned from procedure members upon successfully completing subprocedure.
* @throws ForeignException
* @throws InterruptedException
*/
public HashMap<String, byte[]> waitForCompletedWithRet() throws ForeignException, InterruptedException {
waitForCompleted();
return dataFromFinishedMembers;
}
/**
* Check if the entire procedure has globally completed, or has been aborted.
* @throws ForeignException
*/
public boolean isCompleted() throws ForeignException {
// Rethrow exception if any
monitor.rethrowException();
return (completedLatch.getCount() == 0);
}
/**
* A callback that handles incoming ForeignExceptions.
*/
@Override
public void receive(ForeignException e) {
monitor.receive(e);
}
/**
* Wait for latch to count to zero, ignoring any spurious wake-ups, but waking periodically to
* check for errors
* @param latch latch to wait on
* @param monitor monitor to check for errors while waiting
* @param wakeFrequency frequency to wake up and check for errors (in
* {@link TimeUnit#MILLISECONDS})
* @param latchDescription description of the latch, for logging
* @throws ForeignException type of error the monitor can throw, if the task fails
* @throws InterruptedException if we are interrupted while waiting on latch
*/
public static void waitForLatch(CountDownLatch latch, ForeignExceptionSnare monitor,
long wakeFrequency, String latchDescription) throws ForeignException,
InterruptedException {
boolean released = false;
while (!released) {
if (monitor != null) {
monitor.rethrowException();
}
/*
ForeignExceptionDispatcher.LOG.debug("Waiting for '" + latchDescription + "' latch. (sleep:"
+ wakeFrequency + " ms)"); */
released = latch.await(wakeFrequency, TimeUnit.MILLISECONDS);
}
// check error again in case an error raised during last wait
if (monitor != null) {
monitor.rethrowException();
}
}
}