blob: 2b2a4d7a11e0cdda99711c0f9ea821e38ec9cc8e [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ozone;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.ozone.om.OzoneManager;
/**
* This class causes random failures in OMs in the chaos cluster.
*/
public class MiniOzoneOMChaosCluster extends MiniOzoneChaosCluster {
// Cluster is deemed ready for chaos when all the OMs are up and running.
private AtomicBoolean isClusterReady = new AtomicBoolean(true);
// The maximum number of nodes failures which can be tolerated without
// losing quorum. This should be equal to (Num of OMs - 1)/2.
private int numOfOMNodeFailuresTolerated;
MiniOzoneOMChaosCluster(OzoneConfiguration conf,
List<OzoneManager> ozoneManagers,
StorageContainerManager scm,
List<HddsDatanodeService> hddsDatanodes,
String omServiceID) {
super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
FailureService.OZONE_MANAGER);
setNumNodes(ozoneManagers.size());
numOfOMNodeFailuresTolerated = (getNumNodes() - 1) / 2;
}
/**
* Check if cluster is ready for a restart or shutdown of an OM node. If
* yes, then set isClusterReady to false so that another thread cannot
* restart/ shutdown OM till all OMs are up again.
*/
protected boolean isClusterReady() {
return isClusterReady.compareAndSet(true, false);
}
/**
* If any OM node is not running, restart it.
*/
@Override
protected void getClusterReady() {
boolean clusterReady = true;
for (OzoneManager om : getOzoneManagersList()) {
if (!om.isRunning()) {
try {
restartOzoneManager(om, true);
} catch (Exception e) {
clusterReady = false;
LOG.error("Cluster not ready for chaos. Failed to restart OM {}: {}",
om.getOMNodeId(), e);
}
}
}
if (clusterReady) {
isClusterReady.set(true);
}
}
@Override
protected int getNumberOfNodesToFail() {
return RandomUtils.nextInt(1, numOfOMNodeFailuresTolerated + 1);
}
@Override
protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
throws IOException, TimeoutException, InterruptedException {
shutdownOzoneManager(failedNodeIndex);
restartOzoneManager(failedNodeIndex, waitForNodeRestart);
getClusterReady();
}
/**
* For OM chaos, a shutdown node should eventually be restarted before the
* next failure.
*/
@Override
protected void shutdownNode(int failedNodeIndex)
throws ExecutionException, InterruptedException {
shutdownOzoneManager(failedNodeIndex);
// Restart the OM after FailureInterval / 2 duration.
Executors.newSingleThreadScheduledExecutor().schedule(
this::getClusterReady, getFailureIntervalInMS() / 2,
TimeUnit.MILLISECONDS).get();
}
@Override
protected String getFailedNodeID(int failedNodeIndex) {
return getOzoneManager(failedNodeIndex).getOMNodeId();
}
/**
* When restarting OM, always wait for it to catch up with Leader OM.
*/
@Override
protected boolean isFastRestart() {
return true;
}
@Override
protected boolean shouldStop() {
return true;
}
}