hadoop-ozone/fault-injection-test/mini-chaos-tests/src/test/java/org/apache/hadoop/ozone/MiniOzoneOMChaosCluster.java - ozone - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.hadoop.ozone;

 import java.io.IOException;
 import java.util.List;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;

 import org.apache.commons.lang3.RandomUtils;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
 import org.apache.hadoop.ozone.om.OzoneManager;

 /**
  * This class causes random failures in OMs in the chaos cluster.
  */
 public class MiniOzoneOMChaosCluster extends MiniOzoneChaosCluster {

   // Cluster is deemed ready for chaos when all the OMs are up and running.
   private AtomicBoolean isClusterReady = new AtomicBoolean(true);

   // The maximum number of nodes failures which can be tolerated without
   // losing quorum. This should be equal to (Num of OMs - 1)/2.
   private int numOfOMNodeFailuresTolerated;

   MiniOzoneOMChaosCluster(OzoneConfiguration conf,
       List<OzoneManager> ozoneManagers,
       StorageContainerManager scm,
       List<HddsDatanodeService> hddsDatanodes,
       String omServiceID) {
     super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
         FailureService.OZONE_MANAGER);
     setNumNodes(ozoneManagers.size());
     numOfOMNodeFailuresTolerated = (getNumNodes() - 1) / 2;
   }

   /**
    * Check if cluster is ready for a restart or shutdown of an OM node. If
    * yes, then set isClusterReady to false so that another thread cannot
    * restart/ shutdown OM till all OMs are up again.
    */
   protected boolean isClusterReady() {
     return isClusterReady.compareAndSet(true, false);
   }

   /**
    * If any OM node is not running, restart it.
    */
   @Override
   protected void getClusterReady()  {
     boolean clusterReady = true;
     for (OzoneManager om : getOzoneManagersList()) {
       if (!om.isRunning()) {
         try {
           restartOzoneManager(om, true);
         } catch (Exception e) {
           clusterReady = false;
           LOG.error("Cluster not ready for chaos. Failed to restart OM {}: {}",
               om.getOMNodeId(), e);
         }
       }
     }
     if (clusterReady) {
       isClusterReady.set(true);
     }
   }

   @Override
   protected int getNumberOfNodesToFail() {
     return RandomUtils.nextInt(1, numOfOMNodeFailuresTolerated + 1);
   }

   @Override
   protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
       throws IOException, TimeoutException, InterruptedException {
     shutdownOzoneManager(failedNodeIndex);
     restartOzoneManager(failedNodeIndex, waitForNodeRestart);
     getClusterReady();
   }

   /**
    * For OM chaos, a shutdown node should eventually be restarted before the
    * next failure.
    */
   @Override
   protected void shutdownNode(int failedNodeIndex)
       throws ExecutionException, InterruptedException {
     shutdownOzoneManager(failedNodeIndex);

     // Restart the OM after FailureInterval / 2 duration.
     Executors.newSingleThreadScheduledExecutor().schedule(
         this::getClusterReady, getFailureIntervalInMS() / 2,
         TimeUnit.MILLISECONDS).get();
   }

   @Override
   protected String getFailedNodeID(int failedNodeIndex) {
     return getOzoneManager(failedNodeIndex).getOMNodeId();
   }

   /**
    * When restarting OM, always wait for it to catch up with Leader OM.
    */
   @Override
   protected boolean isFastRestart() {
     return true;
   }

   @Override
   protected boolean shouldStop() {
     return true;
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.hadoop.ozone;

	import java.io.IOException;
	import java.util.List;
	import java.util.concurrent.ExecutionException;
	import java.util.concurrent.Executors;
	import java.util.concurrent.TimeUnit;
	import java.util.concurrent.TimeoutException;
	import java.util.concurrent.atomic.AtomicBoolean;

	import org.apache.commons.lang3.RandomUtils;
	import org.apache.hadoop.hdds.conf.OzoneConfiguration;
	import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
	import org.apache.hadoop.ozone.om.OzoneManager;

	/**
	* This class causes random failures in OMs in the chaos cluster.
	*/
	public class MiniOzoneOMChaosCluster extends MiniOzoneChaosCluster {

	// Cluster is deemed ready for chaos when all the OMs are up and running.
	private AtomicBoolean isClusterReady = new AtomicBoolean(true);

	// The maximum number of nodes failures which can be tolerated without
	// losing quorum. This should be equal to (Num of OMs - 1)/2.
	private int numOfOMNodeFailuresTolerated;

	MiniOzoneOMChaosCluster(OzoneConfiguration conf,
	List<OzoneManager> ozoneManagers,
	StorageContainerManager scm,
	List<HddsDatanodeService> hddsDatanodes,
	String omServiceID) {
	super(conf, ozoneManagers, scm, hddsDatanodes, omServiceID,
	FailureService.OZONE_MANAGER);
	setNumNodes(ozoneManagers.size());
	numOfOMNodeFailuresTolerated = (getNumNodes() - 1) / 2;
	}

	/**
	* Check if cluster is ready for a restart or shutdown of an OM node. If
	* yes, then set isClusterReady to false so that another thread cannot
	* restart/ shutdown OM till all OMs are up again.
	*/
	protected boolean isClusterReady() {
	return isClusterReady.compareAndSet(true, false);
	}

	/**
	* If any OM node is not running, restart it.
	*/
	@Override
	protected void getClusterReady() {
	boolean clusterReady = true;
	for (OzoneManager om : getOzoneManagersList()) {
	if (!om.isRunning()) {
	try {
	restartOzoneManager(om, true);
	} catch (Exception e) {
	clusterReady = false;
	LOG.error("Cluster not ready for chaos. Failed to restart OM {}: {}",
	om.getOMNodeId(), e);
	}
	}
	}
	if (clusterReady) {
	isClusterReady.set(true);
	}
	}

	@Override
	protected int getNumberOfNodesToFail() {
	return RandomUtils.nextInt(1, numOfOMNodeFailuresTolerated + 1);
	}

	@Override
	protected void restartNode(int failedNodeIndex, boolean waitForNodeRestart)
	throws IOException, TimeoutException, InterruptedException {
	shutdownOzoneManager(failedNodeIndex);
	restartOzoneManager(failedNodeIndex, waitForNodeRestart);
	getClusterReady();
	}

	/**
	* For OM chaos, a shutdown node should eventually be restarted before the
	* next failure.
	*/
	@Override
	protected void shutdownNode(int failedNodeIndex)
	throws ExecutionException, InterruptedException {
	shutdownOzoneManager(failedNodeIndex);

	// Restart the OM after FailureInterval / 2 duration.
	Executors.newSingleThreadScheduledExecutor().schedule(
	this::getClusterReady, getFailureIntervalInMS() / 2,
	TimeUnit.MILLISECONDS).get();
	}

	@Override
	protected String getFailedNodeID(int failedNodeIndex) {
	return getOzoneManager(failedNodeIndex).getOMNodeId();
	}

	/**
	* When restarting OM, always wait for it to catch up with Leader OM.
	*/
	@Override
	protected boolean isFastRestart() {
	return true;
	}

	@Override
	protected boolean shouldStop() {
	return true;
	}
	}