| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hadoop.hdfs; |
| |
| import java.io.IOException; |
| import java.util.List; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FSDataOutputStream; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.permission.FsPermission; |
| import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties; |
| import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; |
| import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil; |
| import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; |
| import org.apache.hadoop.io.IOUtils; |
| import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption; |
| import org.apache.hadoop.hdfs.server.namenode.NameNode; |
| import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; |
| import org.apache.hadoop.test.GenericTestUtils; |
| |
| import static org.junit.Assert.*; |
| import org.junit.Before; |
| import org.junit.After; |
| import org.junit.Test; |
| |
| import com.google.common.base.Supplier; |
| import com.google.common.collect.Lists; |
| |
| /** |
| * Tests to verify safe mode correctness. |
| */ |
| public class TestSafeMode { |
| private static final Path TEST_PATH = new Path("/test"); |
| private static final int BLOCK_SIZE = 1024; |
| Configuration conf; |
| MiniDFSCluster cluster; |
| FileSystem fs; |
| DistributedFileSystem dfs; |
| |
| @Before |
| public void startUp() throws IOException { |
| conf = new HdfsConfiguration(); |
| conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); |
| cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build(); |
| cluster.waitActive(); |
| fs = cluster.getFileSystem(); |
| dfs = (DistributedFileSystem)fs; |
| } |
| |
| @After |
| public void tearDown() throws IOException { |
| if (fs != null) { |
| fs.close(); |
| } |
| if (cluster != null) { |
| cluster.shutdown(); |
| } |
| } |
| |
| /** |
| * This test verifies that if SafeMode is manually entered, name-node does not |
| * come out of safe mode even after the startup safe mode conditions are met. |
| * <ol> |
| * <li>Start cluster with 1 data-node.</li> |
| * <li>Create 2 files with replication 1.</li> |
| * <li>Re-start cluster with 0 data-nodes. |
| * Name-node should stay in automatic safe-mode.</li> |
| * <li>Enter safe mode manually.</li> |
| * <li>Start the data-node.</li> |
| * <li>Wait longer than <tt>dfs.namenode.safemode.extension</tt> and |
| * verify that the name-node is still in safe mode.</li> |
| * </ol> |
| * |
| * @throws IOException |
| */ |
| @Test |
| public void testManualSafeMode() throws IOException { |
| fs = (DistributedFileSystem)cluster.getFileSystem(); |
| Path file1 = new Path("/tmp/testManualSafeMode/file1"); |
| Path file2 = new Path("/tmp/testManualSafeMode/file2"); |
| |
| // create two files with one block each. |
| DFSTestUtil.createFile(fs, file1, 1000, (short)1, 0); |
| DFSTestUtil.createFile(fs, file2, 1000, (short)1, 0); |
| fs.close(); |
| cluster.shutdown(); |
| |
| // now bring up just the NameNode. |
| cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0).format(false).build(); |
| cluster.waitActive(); |
| dfs = (DistributedFileSystem)cluster.getFileSystem(); |
| |
| assertTrue("No datanode is started. Should be in SafeMode", |
| dfs.setSafeMode(SafeModeAction.SAFEMODE_GET)); |
| |
| // manually set safemode. |
| dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER); |
| |
| // now bring up the datanode and wait for it to be active. |
| cluster.startDataNodes(conf, 1, true, null, null); |
| cluster.waitActive(); |
| |
| // wait longer than dfs.namenode.safemode.extension |
| try { |
| Thread.sleep(2000); |
| } catch (InterruptedException ignored) {} |
| |
| assertTrue("should still be in SafeMode", |
| dfs.setSafeMode(SafeModeAction.SAFEMODE_GET)); |
| assertFalse("should not be in SafeMode", |
| dfs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE)); |
| } |
| |
| /** |
| * Test that, if there are no blocks in the filesystem, |
| * the NameNode doesn't enter the "safemode extension" period. |
| */ |
| @Test(timeout=45000) |
| public void testNoExtensionIfNoBlocks() throws IOException { |
| cluster.getConfiguration(0).setInt( |
| DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 60000); |
| cluster.restartNameNode(); |
| // Even though we have safemode extension set high, we should immediately |
| // exit safemode on startup because there are no blocks in the namespace. |
| String status = cluster.getNameNode().getNamesystem().getSafemode(); |
| assertEquals("", status); |
| } |
| |
| /** |
| * Test that the NN initializes its under-replicated blocks queue |
| * before it is ready to exit safemode (HDFS-1476) |
| */ |
| @Test(timeout=45000) |
| public void testInitializeReplQueuesEarly() throws Exception { |
| // Spray the blocks around the cluster when we add DNs instead of |
| // concentrating all blocks on the first node. |
| BlockManagerTestUtil.setWritingPrefersLocalNode( |
| cluster.getNamesystem().getBlockManager(), false); |
| |
| cluster.startDataNodes(conf, 2, true, StartupOption.REGULAR, null); |
| cluster.waitActive(); |
| DFSTestUtil.createFile(fs, TEST_PATH, 15*BLOCK_SIZE, (short)1, 1L); |
| |
| |
| List<DataNodeProperties> dnprops = Lists.newLinkedList(); |
| dnprops.add(cluster.stopDataNode(0)); |
| dnprops.add(cluster.stopDataNode(0)); |
| dnprops.add(cluster.stopDataNode(0)); |
| |
| cluster.getConfiguration(0).setFloat( |
| DFSConfigKeys.DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY, 1f/15f); |
| |
| cluster.restartNameNode(); |
| final NameNode nn = cluster.getNameNode(); |
| |
| String status = nn.getNamesystem().getSafemode(); |
| assertEquals("Safe mode is ON.The reported blocks 0 needs additional " + |
| "15 blocks to reach the threshold 0.9990 of total blocks 15. " + |
| "Safe mode will be turned off automatically.", status); |
| assertFalse("Mis-replicated block queues should not be initialized " + |
| "until threshold is crossed", |
| NameNodeAdapter.safeModeInitializedReplQueues(nn)); |
| |
| cluster.restartDataNode(dnprops.remove(0)); |
| |
| // Wait for the block report from the restarted DN to come in. |
| GenericTestUtils.waitFor(new Supplier<Boolean>() { |
| @Override |
| public Boolean get() { |
| return NameNodeAdapter.getSafeModeSafeBlocks(nn) > 0; |
| } |
| }, 10, 10000); |
| // SafeMode is fine-grain synchronized, so the processMisReplicatedBlocks |
| // call is still going on at this point - wait until it's done by grabbing |
| // the lock. |
| nn.getNamesystem().writeLock(); |
| nn.getNamesystem().writeUnlock(); |
| int safe = NameNodeAdapter.getSafeModeSafeBlocks(nn); |
| assertTrue("Expected first block report to make some but not all blocks " + |
| "safe. Got: " + safe, safe >= 1 && safe < 15); |
| BlockManagerTestUtil.updateState(nn.getNamesystem().getBlockManager()); |
| |
| assertTrue(NameNodeAdapter.safeModeInitializedReplQueues(nn)); |
| assertEquals(15 - safe, nn.getNamesystem().getUnderReplicatedBlocks()); |
| |
| cluster.restartDataNodes(); |
| } |
| |
| /** |
| * Test that, when under-replicated blocks are processed at the end of |
| * safe-mode, blocks currently under construction are not considered |
| * under-construction or missing. Regression test for HDFS-2822. |
| */ |
| @Test |
| public void testRbwBlocksNotConsideredUnderReplicated() throws IOException { |
| List<FSDataOutputStream> stms = Lists.newArrayList(); |
| try { |
| // Create some junk blocks so that the NN doesn't just immediately |
| // exit safemode on restart. |
| DFSTestUtil.createFile(fs, new Path("/junk-blocks"), |
| BLOCK_SIZE*4, (short)1, 1L); |
| // Create several files which are left open. It's important to |
| // create several here, because otherwise the first iteration of the |
| // replication monitor will pull them off the replication queue and |
| // hide this bug from the test! |
| for (int i = 0; i < 10; i++) { |
| FSDataOutputStream stm = fs.create( |
| new Path("/append-" + i), true, BLOCK_SIZE, (short) 1, BLOCK_SIZE); |
| stms.add(stm); |
| stm.write(1); |
| stm.hflush(); |
| } |
| |
| cluster.restartNameNode(); |
| FSNamesystem ns = cluster.getNameNode(0).getNamesystem(); |
| BlockManagerTestUtil.updateState(ns.getBlockManager()); |
| assertEquals(0, ns.getPendingReplicationBlocks()); |
| assertEquals(0, ns.getCorruptReplicaBlocks()); |
| assertEquals(0, ns.getMissingBlocksCount()); |
| |
| } finally { |
| for (FSDataOutputStream stm : stms) { |
| IOUtils.closeStream(stm); |
| } |
| cluster.shutdown(); |
| } |
| } |
| |
| public interface FSRun { |
| public abstract void run(FileSystem fs) throws IOException; |
| } |
| |
| /** |
| * Assert that the given function fails to run due to a safe |
| * mode exception. |
| */ |
| public void runFsFun(String msg, FSRun f) { |
| try { |
| f.run(fs); |
| fail(msg); |
| } catch (IOException ioe) { |
| assertTrue(ioe.getMessage().contains("safe mode")); |
| } |
| } |
| |
| /** |
| * Run various fs operations while the NN is in safe mode, |
| * assert that they are either allowed or fail as expected. |
| */ |
| @Test |
| public void testOperationsWhileInSafeMode() throws IOException { |
| final Path file1 = new Path("/file1"); |
| |
| assertFalse(dfs.setSafeMode(SafeModeAction.SAFEMODE_GET)); |
| DFSTestUtil.createFile(fs, file1, 1024, (short)1, 0); |
| assertTrue("Could not enter SM", |
| dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER)); |
| |
| runFsFun("Set quota while in SM", new FSRun() { |
| public void run(FileSystem fs) throws IOException { |
| ((DistributedFileSystem)fs).setQuota(file1, 1, 1); |
| }}); |
| |
| runFsFun("Set perm while in SM", new FSRun() { |
| public void run(FileSystem fs) throws IOException { |
| fs.setPermission(file1, FsPermission.getDefault()); |
| }}); |
| |
| runFsFun("Set owner while in SM", new FSRun() { |
| public void run(FileSystem fs) throws IOException { |
| fs.setOwner(file1, "user", "group"); |
| }}); |
| |
| runFsFun("Set repl while in SM", new FSRun() { |
| public void run(FileSystem fs) throws IOException { |
| fs.setReplication(file1, (short)1); |
| }}); |
| |
| runFsFun("Append file while in SM", new FSRun() { |
| public void run(FileSystem fs) throws IOException { |
| DFSTestUtil.appendFile(fs, file1, "new bytes"); |
| }}); |
| |
| runFsFun("Delete file while in SM", new FSRun() { |
| public void run(FileSystem fs) throws IOException { |
| fs.delete(file1, false); |
| }}); |
| |
| runFsFun("Rename file while in SM", new FSRun() { |
| public void run(FileSystem fs) throws IOException { |
| fs.rename(file1, new Path("file2")); |
| }}); |
| |
| try { |
| fs.setTimes(file1, 0, 0); |
| } catch (IOException ioe) { |
| fail("Set times failed while in SM"); |
| } |
| |
| try { |
| DFSTestUtil.readFile(fs, file1); |
| } catch (IOException ioe) { |
| fail("Set times failed while in SM"); |
| } |
| |
| assertFalse("Could not leave SM", |
| dfs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE)); |
| } |
| |
| /** |
| * Verify that the NameNode stays in safemode when dfs.safemode.datanode.min |
| * is set to a number greater than the number of live datanodes. |
| */ |
| @Test |
| public void testDatanodeThreshold() throws IOException { |
| cluster.shutdown(); |
| Configuration conf = cluster.getConfiguration(0); |
| conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0); |
| conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 1); |
| |
| cluster.restartNameNode(); |
| fs = (DistributedFileSystem)cluster.getFileSystem(); |
| |
| String tipMsg = cluster.getNamesystem().getSafemode(); |
| assertTrue("Safemode tip message looks right: " + tipMsg, |
| tipMsg.contains("The number of live datanodes 0 needs an additional " + |
| "1 live datanodes to reach the minimum number 1. " + |
| "Safe mode will be turned off automatically.")); |
| |
| // Start a datanode |
| cluster.startDataNodes(conf, 1, true, null, null); |
| |
| // Wait long enough for safemode check to refire |
| try { |
| Thread.sleep(1000); |
| } catch (InterruptedException ignored) {} |
| |
| // We now should be out of safe mode. |
| assertEquals("", cluster.getNamesystem().getSafemode()); |
| } |
| |
| /* |
| * Tests some utility methods that surround the SafeMode's state. |
| * @throws IOException when there's an issue connecting to the test DFS. |
| */ |
| public void testSafeModeUtils() throws IOException { |
| dfs = (DistributedFileSystem)cluster.getFileSystem(); |
| |
| // Enter safemode. |
| dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER); |
| assertTrue("State was expected to be in safemode.", dfs.isInSafeMode()); |
| |
| // Exit safemode. |
| dfs.setSafeMode(SafeModeAction.SAFEMODE_LEAVE); |
| assertFalse("State was expected to be out of safemode.", dfs.isInSafeMode()); |
| } |
| } |