| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.ozone.om; |
| |
| import org.apache.commons.lang3.RandomStringUtils; |
| import org.apache.hadoop.hdds.HddsConfigKeys; |
| import org.apache.hadoop.hdds.conf.OzoneConfiguration; |
| import org.apache.hadoop.hdds.protocol.proto.HddsProtos; |
| import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleEvent; |
| import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor; |
| import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; |
| import org.apache.hadoop.hdds.scm.container.ContainerInfo; |
| import org.apache.hadoop.hdds.scm.container.SCMContainerManager; |
| import org.apache.hadoop.hdds.scm.events.SCMEvents; |
| import org.apache.hadoop.hdds.scm.exceptions.SCMException; |
| import org.apache.hadoop.hdds.scm.protocolPB.StorageContainerLocationProtocolClientSideTranslatorPB; |
| import org.apache.hadoop.hdds.scm.safemode.SCMSafeModeManager; |
| import org.apache.hadoop.hdds.scm.server.SCMClientProtocolServer; |
| import org.apache.hadoop.hdds.scm.server.StorageContainerManager; |
| import org.apache.hadoop.hdds.server.events.EventQueue; |
| import org.apache.hadoop.ozone.HddsDatanodeService; |
| import org.apache.hadoop.ozone.MiniOzoneCluster; |
| import org.apache.hadoop.ozone.OzoneConfigKeys; |
| import org.apache.hadoop.ozone.TestStorageContainerManagerHelper; |
| import org.apache.hadoop.ozone.client.ObjectStore; |
| import org.apache.hadoop.ozone.client.OzoneBucket; |
| import org.apache.hadoop.ozone.client.OzoneVolume; |
| import org.apache.hadoop.ozone.om.helpers.OmKeyInfo; |
| import org.apache.hadoop.test.GenericTestUtils; |
| import org.apache.hadoop.test.LambdaTestUtils; |
| import org.junit.After; |
| import org.junit.Assert; |
| import org.junit.Before; |
| import org.junit.Ignore; |
| import org.junit.Rule; |
| import org.junit.Test; |
| import org.junit.rules.Timeout; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| |
| import static org.apache.hadoop.hdds.client.ReplicationType.RATIS; |
| import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE; |
| import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL; |
| import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL; |
| import static org.junit.Assert.assertFalse; |
| import static org.junit.Assert.assertTrue; |
| import static org.junit.Assert.fail; |
| |
| /** |
| * Test Ozone Manager operation in distributed handler scenario. |
| */ |
| @Ignore |
| public class TestScmSafeMode { |
| |
| private final static Logger LOG = LoggerFactory |
| .getLogger(TestScmSafeMode.class); |
| private static MiniOzoneCluster cluster = null; |
| private static MiniOzoneCluster.Builder builder = null; |
| private static OzoneConfiguration conf; |
| private static OzoneManager om; |
| private static StorageContainerLocationProtocolClientSideTranslatorPB |
| storageContainerLocationClient; |
| |
| |
| @Rule |
| public Timeout timeout = new Timeout(1000 * 200); |
| |
| /** |
| * Create a MiniDFSCluster for testing. |
| * <p> |
| * Ozone is made active by setting OZONE_ENABLED = true and |
| * OZONE_HANDLER_TYPE_KEY = "distributed" |
| * |
| * @throws IOException |
| */ |
| @Before |
| public void init() throws Exception { |
| conf = new OzoneConfiguration(); |
| conf.set(OZONE_SCM_STALENODE_INTERVAL, "10s"); |
| conf.set(OZONE_SCM_DEADNODE_INTERVAL, "25s"); |
| builder = MiniOzoneCluster.newBuilder(conf) |
| .setHbInterval(1000) |
| .setHbProcessorInterval(500) |
| .setStartDataNodes(false); |
| cluster = builder.build(); |
| cluster.startHddsDatanodes(); |
| cluster.waitForClusterToBeReady(); |
| om = cluster.getOzoneManager(); |
| storageContainerLocationClient = cluster |
| .getStorageContainerLocationClient(); |
| } |
| |
| /** |
| * Shutdown MiniDFSCluster. |
| */ |
| @After |
| public void shutdown() { |
| if (cluster != null) { |
| try { |
| cluster.shutdown(); |
| } catch (Exception e) { |
| // do nothing. |
| } |
| } |
| } |
| |
| @Test(timeout = 300_000) |
| public void testSafeModeOperations() throws Exception { |
| // Create {numKeys} random names keys. |
| TestStorageContainerManagerHelper helper = |
| new TestStorageContainerManagerHelper(cluster, conf); |
| Map<String, OmKeyInfo> keyLocations = helper.createKeys(100, 4096); |
| final List<ContainerInfo> containers = cluster |
| .getStorageContainerManager().getContainerManager().getContainers(); |
| GenericTestUtils.waitFor(() -> containers.size() >= 3, 100, 1000); |
| |
| String volumeName = "volume" + RandomStringUtils.randomNumeric(5); |
| String bucketName = "bucket" + RandomStringUtils.randomNumeric(5); |
| String keyName = "key" + RandomStringUtils.randomNumeric(5); |
| |
| ObjectStore store = cluster.getRpcClient().getObjectStore(); |
| store.createVolume(volumeName); |
| OzoneVolume volume = store.getVolume(volumeName); |
| volume.createBucket(bucketName); |
| OzoneBucket bucket = volume.getBucket(bucketName); |
| bucket.createKey(keyName, 1000, RATIS, ONE, new HashMap<>()); |
| |
| cluster.stop(); |
| |
| try { |
| cluster = builder.build(); |
| } catch (IOException e) { |
| fail("failed"); |
| } |
| |
| |
| StorageContainerManager scm; |
| |
| scm = cluster.getStorageContainerManager(); |
| Assert.assertTrue(scm.isInSafeMode()); |
| |
| om = cluster.getOzoneManager(); |
| |
| |
| final OzoneBucket bucket1 = |
| cluster.getRpcClient().getObjectStore().getVolume(volumeName) |
| .getBucket(bucketName); |
| |
| // As cluster is restarted with out datanodes restart |
| LambdaTestUtils.intercept(IOException.class, |
| "SafeModePrecheck failed for allocateBlock", |
| () -> bucket1.createKey(keyName, 1000, RATIS, ONE, |
| new HashMap<>())); |
| } |
| |
| /** |
| * Tests inSafeMode & forceExitSafeMode api calls. |
| */ |
| @Test(timeout = 300_000) |
| public void testIsScmInSafeModeAndForceExit() throws Exception { |
| // Test 1: SCM should be out of safe mode. |
| Assert.assertFalse(storageContainerLocationClient.inSafeMode()); |
| cluster.stop(); |
| // Restart the cluster with same metadata dir. |
| |
| try { |
| cluster = builder.build(); |
| } catch (IOException e) { |
| Assert.fail("Cluster startup failed."); |
| } |
| |
| // Test 2: Scm should be in safe mode as datanodes are not started yet. |
| storageContainerLocationClient = cluster |
| .getStorageContainerLocationClient(); |
| Assert.assertTrue(storageContainerLocationClient.inSafeMode()); |
| // Force scm out of safe mode. |
| cluster.getStorageContainerManager().getClientProtocolServer() |
| .forceExitSafeMode(); |
| // Test 3: SCM should be out of safe mode. |
| GenericTestUtils.waitFor(() -> { |
| try { |
| return !cluster.getStorageContainerManager().getClientProtocolServer() |
| .inSafeMode(); |
| } catch (IOException e) { |
| Assert.fail("Cluster"); |
| return false; |
| } |
| }, 10, 1000 * 5); |
| |
| } |
| |
| @Test(timeout = 300_000) |
| public void testSCMSafeMode() throws Exception { |
| // Test1: Test safe mode when there are no containers in system. |
| cluster.stop(); |
| |
| try { |
| cluster = builder.build(); |
| } catch (IOException e) { |
| Assert.fail("Cluster startup failed."); |
| } |
| assertTrue(cluster.getStorageContainerManager().isInSafeMode()); |
| cluster.startHddsDatanodes(); |
| cluster.waitForClusterToBeReady(); |
| cluster.waitTobeOutOfSafeMode(); |
| assertFalse(cluster.getStorageContainerManager().isInSafeMode()); |
| |
| // Test2: Test safe mode when containers are there in system. |
| // Create {numKeys} random names keys. |
| TestStorageContainerManagerHelper helper = |
| new TestStorageContainerManagerHelper(cluster, conf); |
| Map<String, OmKeyInfo> keyLocations = helper.createKeys(100 * 2, 4096); |
| final List<ContainerInfo> containers = cluster |
| .getStorageContainerManager().getContainerManager().getContainers(); |
| GenericTestUtils.waitFor(() -> containers.size() >= 3, 100, 1000 * 30); |
| |
| // Removing some container to keep them open. |
| containers.remove(0); |
| containers.remove(0); |
| |
| // Close remaining containers |
| SCMContainerManager mapping = (SCMContainerManager) cluster |
| .getStorageContainerManager().getContainerManager(); |
| containers.forEach(c -> { |
| try { |
| mapping.updateContainerState(c.containerID(), |
| HddsProtos.LifeCycleEvent.FINALIZE); |
| mapping.updateContainerState(c.containerID(), |
| LifeCycleEvent.CLOSE); |
| } catch (IOException e) { |
| LOG.info("Failed to change state of open containers.", e); |
| } |
| }); |
| cluster.stop(); |
| |
| GenericTestUtils.LogCapturer logCapturer = GenericTestUtils.LogCapturer |
| .captureLogs(SCMSafeModeManager.getLogger()); |
| logCapturer.clearOutput(); |
| |
| try { |
| cluster = builder.build(); |
| } catch (IOException ex) { |
| fail("failed"); |
| } |
| |
| StorageContainerManager scm; |
| |
| scm = cluster.getStorageContainerManager(); |
| assertTrue(scm.isInSafeMode()); |
| assertFalse(logCapturer.getOutput().contains("SCM exiting safe mode.")); |
| assertTrue(scm.getCurrentContainerThreshold() == 0); |
| for (HddsDatanodeService dn : cluster.getHddsDatanodes()) { |
| dn.start(); |
| } |
| GenericTestUtils |
| .waitFor(() -> scm.getCurrentContainerThreshold() == 1.0, 100, 20000); |
| |
| EventQueue eventQueue = |
| (EventQueue) cluster.getStorageContainerManager().getEventQueue(); |
| eventQueue.processAll(5000L); |
| |
| double safeModeCutoff = conf |
| .getDouble(HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT, |
| HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT); |
| assertTrue(scm.getCurrentContainerThreshold() >= safeModeCutoff); |
| assertTrue(logCapturer.getOutput().contains("SCM exiting safe mode.")); |
| assertFalse(scm.isInSafeMode()); |
| } |
| |
| @Test(timeout = 300_000) |
| public void testSCMSafeModeRestrictedOp() throws Exception { |
| conf.set(OzoneConfigKeys.OZONE_METADATA_STORE_IMPL, |
| OzoneConfigKeys.OZONE_METADATA_STORE_IMPL_LEVELDB); |
| cluster.stop(); |
| cluster = builder.build(); |
| StorageContainerManager scm = cluster.getStorageContainerManager(); |
| assertTrue(scm.isInSafeMode()); |
| |
| LambdaTestUtils.intercept(SCMException.class, |
| "SafeModePrecheck failed for allocateContainer", () -> { |
| scm.getClientProtocolServer() |
| .allocateContainer(ReplicationType.STAND_ALONE, |
| ReplicationFactor.ONE, ""); |
| }); |
| |
| cluster.startHddsDatanodes(); |
| cluster.waitForClusterToBeReady(); |
| cluster.waitTobeOutOfSafeMode(); |
| assertFalse(scm.isInSafeMode()); |
| |
| TestStorageContainerManagerHelper helper = |
| new TestStorageContainerManagerHelper(cluster, conf); |
| helper.createKeys(10, 4096); |
| SCMClientProtocolServer clientProtocolServer = cluster |
| .getStorageContainerManager().getClientProtocolServer(); |
| assertFalse((scm.getClientProtocolServer()).getSafeModeStatus()); |
| final List<ContainerInfo> containers = scm.getContainerManager() |
| .getContainers(); |
| scm.getEventQueue().fireEvent(SCMEvents.SAFE_MODE_STATUS, |
| new SCMSafeModeManager.SafeModeStatus(true, true)); |
| GenericTestUtils.waitFor(() -> { |
| return clientProtocolServer.getSafeModeStatus(); |
| }, 50, 1000 * 30); |
| assertTrue(clientProtocolServer.getSafeModeStatus()); |
| |
| cluster.shutdownHddsDatanodes(); |
| Thread.sleep(30000); |
| LambdaTestUtils.intercept(SCMException.class, |
| "Open container " + containers.get(0).getContainerID() + " " |
| + "doesn't have enough replicas to service this operation in Safe" |
| + " mode.", () -> clientProtocolServer |
| .getContainerWithPipeline(containers.get(0).getContainerID())); |
| } |
| |
| @Test(timeout = 300_000) |
| public void testSCMSafeModeDisabled() throws Exception { |
| cluster.shutdown(); |
| |
| // If safe mode is disabled, cluster should not be in safe mode even if |
| // min number of datanodes are not started. |
| conf.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED, false); |
| conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 3); |
| builder = MiniOzoneCluster.newBuilder(conf) |
| .setHbInterval(1000) |
| .setHbProcessorInterval(500) |
| .setNumDatanodes(3); |
| cluster = builder.build(); |
| StorageContainerManager scm = cluster.getStorageContainerManager(); |
| assertFalse(scm.isInSafeMode()); |
| |
| // Even on SCM restart, cluster should be out of safe mode immediately. |
| cluster.restartStorageContainerManager(true); |
| assertFalse(scm.isInSafeMode()); |
| } |
| } |