Merge trunk into HA branch.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1296485 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt b/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt
new file mode 100644
index 0000000..748ff93
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt
@@ -0,0 +1,53 @@
+Changes for HDFS-1623 branch.
+
+This change list will be merged into the trunk CHANGES.txt when the HDFS-1623
+branch is merged.
+------------------------------
+
+HADOOP-7455. HA: Introduce HA Service Protocol Interface. (suresh)
+
+HADOOP-7774. HA: Administrative CLI to control HA daemons. (todd)
+
+HADOOP-7896. HA: if both NNs are in Standby mode, client needs to try failing
+back and forth several times with sleeps. (atm)
+
+HADOOP-7922. Improve some logging for client IPC failovers and
+StandbyExceptions (todd)
+
+HADOOP-7921. StandbyException should extend IOException (todd)
+
+HADOOP-7928. HA: Client failover policy is incorrectly trying to fail over all
+IOExceptions (atm)
+
+HADOOP-7925. Add interface and update CLI to query current state to
+HAServiceProtocol (eli via todd)
+
+HADOOP-7932. Make client connection retries on socket time outs configurable.
+(Uma Maheswara Rao G via todd)
+
+HADOOP-7924. FailoverController for client-based configuration (eli)
+
+HADOOP-7961. Move HA fencing to common. (eli)
+
+HADOOP-7970. HAServiceProtocol methods must throw IOException.
+(Hari Mankude via suresh).
+
+HADOOP-7992. Add ZKClient library to facilitate leader election.
+(Bikas Saha via suresh).
+
+HADOOP-7983. HA: failover should be able to pass args to fencers. (eli)
+
+HADOOP-7938. HA: the FailoverController should optionally fence the
+active during failover. (eli)
+
+HADOOP-7991. HA: the FailoverController should check the standby is
+ready before failing over. (eli)
+
+HADOOP-8038. Add 'ipc.client.connect.max.retries.on.timeouts' entry in
+core-default.xml file. (Uma Maheswara Rao G via atm)
+
+HADOOP-8041. Log a warning when a failover is first attempted (todd)
+
+HADOOP-8068. void methods can swallow exceptions when going through failover path (todd)
+
+HADOOP-8116. RetriableCommand is using RetryPolicy incorrectly after HADOOP-7896. (atm)
diff --git a/hadoop-common-project/hadoop-common/dev-support/findbugsExcludeFile.xml b/hadoop-common-project/hadoop-common/dev-support/findbugsExcludeFile.xml
index 3624c99..855b028 100644
--- a/hadoop-common-project/hadoop-common/dev-support/findbugsExcludeFile.xml
+++ b/hadoop-common-project/hadoop-common/dev-support/findbugsExcludeFile.xml
@@ -278,8 +278,12 @@
<!-- protobuf generated code -->
<Class name="~org\.apache\.hadoop\.ipc\.protobuf\.ProtocolInfoProtos.*"/>
</Match>
- <Match>
+ <Match>
<!-- protobuf generated code -->
<Class name="~org\.apache\.hadoop\.ipc\.protobuf\.IpcConnectionContextProtos.*"/>
</Match>
+ <Match>
+ <!-- protobuf generated code -->
+ <Class name="~org\.apache\.hadoop\.ha\.proto\.HAServiceProtocolProtos.*"/>
+ </Match>
</FindBugsFilter>
diff --git a/hadoop-common-project/hadoop-common/pom.xml b/hadoop-common-project/hadoop-common/pom.xml
index 12d98c6..fd18b607 100644
--- a/hadoop-common-project/hadoop-common/pom.xml
+++ b/hadoop-common-project/hadoop-common/pom.xml
@@ -263,6 +263,38 @@
<artifactId>json-simple</artifactId>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>com.jcraft</groupId>
+ <artifactId>jsch</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.zookeeper</groupId>
+ <artifactId>zookeeper</artifactId>
+ <version>3.4.2</version>
+ <exclusions>
+ <exclusion>
+ <!-- otherwise seems to drag in junit 3.8.1 via jline -->
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jdmk</groupId>
+ <artifactId>jmxtools</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jmx</groupId>
+ <artifactId>jmxri</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.zookeeper</groupId>
+ <artifactId>zookeeper</artifactId>
+ <version>3.4.2</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/service_level_auth.xml b/hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/service_level_auth.xml
index b8f5f51..771ac05 100644
--- a/hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/service_level_auth.xml
+++ b/hadoop-common-project/hadoop-common/src/main/docs/src/documentation/content/xdocs/service_level_auth.xml
@@ -138,6 +138,12 @@
dfsadmin and mradmin commands to refresh the security policy in-effect.
</td>
</tr>
+ <tr>
+ <td><code>security.ha.service.protocol.acl</code></td>
+          <td>ACL for the HAService protocol, used by HAAdmin to manage the
+          active and standby states of the NameNode.
+ </td>
+ </tr>
</table>
</section>
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java
index f0ca72b..c2a6479 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java
@@ -114,11 +114,12 @@
public static final String
HADOOP_SECURITY_SERVICE_AUTHORIZATION_REFRESH_USER_MAPPINGS =
"security.refresh.user.mappings.protocol.acl";
+ public static final String
+ SECURITY_HA_SERVICE_PROTOCOL_ACL = "security.ha.service.protocol.acl";
public static final String HADOOP_SECURITY_TOKEN_SERVICE_USE_IP =
"hadoop.security.token.service.use_ip";
public static final boolean HADOOP_SECURITY_TOKEN_SERVICE_USE_IP_DEFAULT =
true;
-
}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java
index 401d07ab..7953411 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeysPublic.java
@@ -172,6 +172,11 @@
/** Default value for IPC_CLIENT_CONNECT_MAX_RETRIES_KEY */
public static final int IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT = 10;
/** See <a href="{@docRoot}/../core-default.html">core-default.xml</a> */
+ public static final String IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY =
+ "ipc.client.connect.max.retries.on.timeouts";
+ /** Default value for IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY */
+ public static final int IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_DEFAULT = 45;
+ /** See <a href="{@docRoot}/../core-default.html">core-default.xml</a> */
public static final String IPC_CLIENT_TCPNODELAY_KEY =
"ipc.client.tcpnodelay";
  /** Default value for IPC_CLIENT_TCPNODELAY_KEY */
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
new file mode 100644
index 0000000..7da2d3e
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
@@ -0,0 +1,593 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.HadoopIllegalArgumentException;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.zookeeper.data.ACL;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.Watcher;
+import org.apache.zookeeper.WatchedEvent;
+import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.AsyncCallback.*;
+import org.apache.zookeeper.data.Stat;
+import org.apache.zookeeper.KeeperException.Code;
+
+import com.google.common.annotations.VisibleForTesting;
+
+/**
+ *
+ * This class implements a simple library to perform leader election on top of
+ * Apache Zookeeper. Using Zookeeper as a coordination service, leader election
+ * can be performed by atomically creating an ephemeral lock file (znode) on
+ * Zookeeper. The service instance that successfully creates the znode becomes
+ * active and the rest become standbys. <br/>
+ * This election mechanism is only efficient for a small number of election
+ * candidates (on the order of tens) because contention on a single znode by a
+ * large number of candidates can overload Zookeeper. <br/>
+ * The elector does not guarantee fencing (protection of shared resources) among
+ * service instances. After it has notified an instance that it has become the
+ * leader, that instance must ensure that it meets the service consistency
+ * requirements. If it cannot do so, it is recommended to quit the
+ * election. The application implements the {@link ActiveStandbyElectorCallback}
+ * to interact with the elector; a usage sketch follows that interface below.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class ActiveStandbyElector implements Watcher, StringCallback,
+ StatCallback {
+
+ /**
+ * Callback interface to interact with the ActiveStandbyElector object. <br/>
+ * The application will be notified with a callback only on state changes
+ * (i.e. there will never be successive calls to becomeActive without an
+ * intermediate call to enterNeutralMode). <br/>
+ * The callbacks will be running on Zookeeper client library threads. The
+ * application should return from these callbacks quickly so as not to impede
+ * Zookeeper client library performance and notifications. The app will
+ * typically remember the state change and return from the callback. It will
+ * then proceed with implementing actions around that state change. It is
+ * possible to be called back again while these actions are in flight and the
+ * app should handle this scenario.
+ */
+ public interface ActiveStandbyElectorCallback {
+ /**
+ * This method is called when the app becomes the active leader
+ */
+ void becomeActive();
+
+ /**
+ * This method is called when the app becomes a standby
+ */
+ void becomeStandby();
+
+ /**
+ * If the elector gets disconnected from Zookeeper and does not know about
+ * the lock state, then it will notify the service via the enterNeutralMode
+ * interface. The service may choose to ignore this or stop doing state
+ * changing operations. Upon reconnection, the elector verifies the leader
+ * status and calls back on the becomeActive and becomeStandby app
+ * interfaces. <br/>
+ * Zookeeper disconnects can happen due to network issues or loss of
+ * Zookeeper quorum. Thus enterNeutralMode can be used to guard against
+ * split-brain issues. In such situations it might be prudent to call
+ * becomeStandby too. However, such state change operations might be
+ * expensive and enterNeutralMode can help guard against doing that for
+ * transient issues.
+ */
+ void enterNeutralMode();
+
+ /**
+     * If there is any fatal error (e.g. wrong ACLs, unexpected Zookeeper
+     * errors or persistent Zookeeper unavailability) then notifyFatalError is
+ * called to notify the app about it.
+ */
+ void notifyFatalError(String errorMessage);
+ }
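+
+  /*
+   * Usage sketch (for illustration only; the ZooKeeper quorum, parent znode,
+   * service address and the ElectorClient class with its helper methods are
+   * hypothetical). It shows how a service instance might wire up the callback
+   * and join the election:
+   *
+   *   class ElectorClient implements ActiveStandbyElectorCallback {
+   *     public void becomeActive()     { startActiveServices(); }
+   *     public void becomeStandby()    { startStandbyServices(); }
+   *     public void enterNeutralMode() { pauseStateChangingOperations(); }
+   *     public void notifyFatalError(String errorMessage) { abort(errorMessage); }
+   *   }
+   *
+   *   ActiveStandbyElector elector = new ActiveStandbyElector(
+   *       "zk1:2181,zk2:2181,zk3:2181",  // ZooKeeper quorum
+   *       5000,                          // session timeout, ~3x the disconnect timeout
+   *       "/my-service-ha",              // parent znode, shared by all instances
+   *       ZooDefs.Ids.OPEN_ACL_UNSAFE,   // use restrictive ACLs in real deployments
+   *       new ElectorClient());
+   *   elector.joinElection("host1:8020".getBytes());
+   */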
+
+ /**
+ * Name of the lock znode used by the library. Protected for access in test
+ * classes
+ */
+ @VisibleForTesting
+ protected static final String LOCKFILENAME = "ActiveStandbyElectorLock";
+
+ public static final Log LOG = LogFactory.getLog(ActiveStandbyElector.class);
+
+ private static final int NUM_RETRIES = 3;
+
+ private enum ConnectionState {
+ DISCONNECTED, CONNECTED, TERMINATED
+ };
+
+ private enum State {
+ INIT, ACTIVE, STANDBY, NEUTRAL
+ };
+
+ private State state = State.INIT;
+ private int createRetryCount = 0;
+ private int statRetryCount = 0;
+ private ZooKeeper zkClient;
+ private ConnectionState zkConnectionState = ConnectionState.TERMINATED;
+
+ private final ActiveStandbyElectorCallback appClient;
+ private final String zkHostPort;
+ private final int zkSessionTimeout;
+ private final List<ACL> zkAcl;
+ private byte[] appData;
+ private final String zkLockFilePath;
+ private final String znodeWorkingDir;
+
+ /**
+ * Create a new ActiveStandbyElector object <br/>
+ * The elector is created by providing to it the Zookeeper configuration, the
+ * parent znode under which to create the znode and a reference to the
+ * callback interface. <br/>
+ * The parent znode name must be the same for all service instances and
+ * different across services. <br/>
+ * After the leader has been lost, a new leader will be elected after the
+ * session timeout expires. Hence, the app must set this parameter based on
+ * its needs for failure response time. The session timeout must be greater
+ * than the Zookeeper disconnect timeout and is recommended to be 3X that
+ * value to enable Zookeeper to retry transient disconnections. Setting a very
+ * short session timeout may result in frequent transitions between active and
+   * standby states during issues like network outages or GC pauses.
+ *
+ * @param zookeeperHostPorts
+ * ZooKeeper hostPort for all ZooKeeper servers
+ * @param zookeeperSessionTimeout
+ * ZooKeeper session timeout
+ * @param parentZnodeName
+ * znode under which to create the lock
+ * @param acl
+   *          ZooKeeper ACLs
+ * @param app
+ * reference to callback interface object
+ * @throws IOException
+ * @throws HadoopIllegalArgumentException
+ */
+ public ActiveStandbyElector(String zookeeperHostPorts,
+ int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
+ ActiveStandbyElectorCallback app) throws IOException,
+ HadoopIllegalArgumentException {
+ if (app == null || acl == null || parentZnodeName == null
+ || zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
+ throw new HadoopIllegalArgumentException("Invalid argument");
+ }
+ zkHostPort = zookeeperHostPorts;
+ zkSessionTimeout = zookeeperSessionTimeout;
+ zkAcl = acl;
+ appClient = app;
+ znodeWorkingDir = parentZnodeName;
+ zkLockFilePath = znodeWorkingDir + "/" + LOCKFILENAME;
+
+ // createConnection for future API calls
+ createConnection();
+ }
+
+ /**
+   * To participate in the election, the app calls joinElection. The result
+   * is reported via a callback on either the becomeActive or becomeStandby
+   * app interface. <br/>
+   * After this the elector will automatically monitor the leader status and
+   * perform re-election if necessary.<br/>
+   * The app could potentially start off in standby mode and ignore the
+   * becomeStandby call.
+   *
+   * @param data
+   *          data to be set by the app; must be non-null
+ * @throws HadoopIllegalArgumentException
+ * if valid data is not supplied
+ */
+ public synchronized void joinElection(byte[] data)
+ throws HadoopIllegalArgumentException {
+ LOG.debug("Attempting active election");
+
+ if (data == null) {
+ throw new HadoopIllegalArgumentException("data cannot be null");
+ }
+
+ appData = new byte[data.length];
+ System.arraycopy(data, 0, appData, 0, data.length);
+
+ joinElectionInternal();
+ }
+
+ /**
+ * Any service instance can drop out of the election by calling quitElection.
+ * <br/>
+ * This will lose any leader status, if held, and stop monitoring of the lock
+ * node. <br/>
+ * If the instance wants to participate in election again, then it needs to
+ * call joinElection(). <br/>
+ * This allows service instances to take themselves out of rotation for known
+ * impending unavailable states (e.g. long GC pause or software upgrade).
+ */
+ public synchronized void quitElection() {
+ LOG.debug("Yielding from election");
+ reset();
+ }
+
+ /**
+ * Exception thrown when there is no active leader
+ */
+ public static class ActiveNotFoundException extends Exception {
+ private static final long serialVersionUID = 3505396722342846462L;
+ }
+
+ /**
+   * Get the data set by the active leader.
+ *
+ * @return data set by the active instance
+ * @throws ActiveNotFoundException
+ * when there is no active leader
+ * @throws KeeperException
+ * other zookeeper operation errors
+ * @throws InterruptedException
+ * @throws IOException
+ * when ZooKeeper connection could not be established
+ */
+ public synchronized byte[] getActiveData() throws ActiveNotFoundException,
+ KeeperException, InterruptedException, IOException {
+ try {
+ if (zkClient == null) {
+ createConnection();
+ }
+ Stat stat = new Stat();
+ return zkClient.getData(zkLockFilePath, false, stat);
+ } catch(KeeperException e) {
+ Code code = e.code();
+ if (operationNodeDoesNotExist(code)) {
+ // handle the commonly expected cases that make sense for us
+ throw new ActiveNotFoundException();
+ } else {
+ throw e;
+ }
+ }
+ }
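+
+  /*
+   * For illustration (assuming the active published its RPC address as the
+   * election data), a standby could locate the current active with:
+   *
+   *   try {
+   *     String activeAddr = new String(elector.getActiveData());
+   *   } catch (ActiveNotFoundException e) {
+   *     // no instance currently holds the lock
+   *   }
+   *
+   * KeeperException, InterruptedException and IOException handling is
+   * omitted above for brevity.
+   */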
+
+ /**
+ * interface implementation of Zookeeper callback for create
+ */
+ @Override
+ public synchronized void processResult(int rc, String path, Object ctx,
+ String name) {
+ LOG.debug("CreateNode result: " + rc + " for path: " + path
+ + " connectionState: " + zkConnectionState);
+ if (zkClient == null) {
+ // zkClient is nulled before closing the connection
+ // this is the callback with session expired after we closed the session
+ return;
+ }
+
+ Code code = Code.get(rc);
+ if (operationSuccess(code)) {
+ // we successfully created the znode. we are the leader. start monitoring
+ becomeActive();
+ monitorActiveStatus();
+ return;
+ }
+
+ if (operationNodeExists(code)) {
+ if (createRetryCount == 0) {
+ // znode exists and we did not retry the operation. so a different
+ // instance has created it. become standby and monitor lock.
+ becomeStandby();
+ }
+ // if we had retried then the znode could have been created by our first
+ // attempt to the server (that we lost) and this node exists response is
+ // for the second attempt. verify this case via ephemeral node owner. this
+ // will happen on the callback for monitoring the lock.
+ monitorActiveStatus();
+ return;
+ }
+
+ String errorMessage = "Received create error from Zookeeper. code:"
+ + code.toString();
+ LOG.debug(errorMessage);
+
+ if (operationRetry(code)) {
+ if (createRetryCount < NUM_RETRIES) {
+ LOG.debug("Retrying createNode createRetryCount: " + createRetryCount);
+ ++createRetryCount;
+ createNode();
+ return;
+ }
+ errorMessage = errorMessage
+ + ". Not retrying further znode create connection errors.";
+ }
+
+ fatalError(errorMessage);
+ }
+
+ /**
+ * interface implementation of Zookeeper callback for monitor (exists)
+ */
+ @Override
+ public synchronized void processResult(int rc, String path, Object ctx,
+ Stat stat) {
+ LOG.debug("StatNode result: " + rc + " for path: " + path
+ + " connectionState: " + zkConnectionState);
+ if (zkClient == null) {
+ // zkClient is nulled before closing the connection
+ // this is the callback with session expired after we closed the session
+ return;
+ }
+
+ Code code = Code.get(rc);
+ if (operationSuccess(code)) {
+ // the following owner check completes verification in case the lock znode
+ // creation was retried
+ if (stat.getEphemeralOwner() == zkClient.getSessionId()) {
+ // we own the lock znode. so we are the leader
+ becomeActive();
+ } else {
+        // we don't own the lock znode. so we are a standby.
+ becomeStandby();
+ }
+ // the watch set by us will notify about changes
+ return;
+ }
+
+ if (operationNodeDoesNotExist(code)) {
+ // the lock znode disappeared before we started monitoring it
+ enterNeutralMode();
+ joinElectionInternal();
+ return;
+ }
+
+ String errorMessage = "Received stat error from Zookeeper. code:"
+ + code.toString();
+ LOG.debug(errorMessage);
+
+ if (operationRetry(code)) {
+ if (statRetryCount < NUM_RETRIES) {
+ ++statRetryCount;
+ monitorNode();
+ return;
+ }
+ errorMessage = errorMessage
+ + ". Not retrying further znode monitoring connection errors.";
+ }
+
+ fatalError(errorMessage);
+ }
+
+ /**
+ * interface implementation of Zookeeper watch events (connection and node)
+ */
+ @Override
+ public synchronized void process(WatchedEvent event) {
+ Event.EventType eventType = event.getType();
+ LOG.debug("Watcher event type: " + eventType + " with state:"
+ + event.getState() + " for path:" + event.getPath()
+ + " connectionState: " + zkConnectionState);
+ if (zkClient == null) {
+ // zkClient is nulled before closing the connection
+ // this is the callback with session expired after we closed the session
+ return;
+ }
+
+ if (eventType == Event.EventType.None) {
+ // the connection state has changed
+ switch (event.getState()) {
+ case SyncConnected:
+ // if the listener was asked to move to safe state then it needs to
+ // be undone
+ ConnectionState prevConnectionState = zkConnectionState;
+ zkConnectionState = ConnectionState.CONNECTED;
+ if (prevConnectionState == ConnectionState.DISCONNECTED) {
+ monitorActiveStatus();
+ }
+ break;
+ case Disconnected:
+ // ask the app to move to safe state because zookeeper connection
+        // is not active and we don't know our state
+ zkConnectionState = ConnectionState.DISCONNECTED;
+ enterNeutralMode();
+ break;
+ case Expired:
+ // the connection got terminated because of session timeout
+ // call listener to reconnect
+ enterNeutralMode();
+ reJoinElection();
+ break;
+ default:
+ fatalError("Unexpected Zookeeper watch event state: "
+ + event.getState());
+ break;
+ }
+
+ return;
+ }
+
+ // a watch on lock path in zookeeper has fired. so something has changed on
+ // the lock. ideally we should check that the path is the same as the lock
+ // path but trusting zookeeper for now
+ String path = event.getPath();
+ if (path != null) {
+ switch (eventType) {
+ case NodeDeleted:
+ if (state == State.ACTIVE) {
+ enterNeutralMode();
+ }
+ joinElectionInternal();
+ break;
+ case NodeDataChanged:
+ monitorActiveStatus();
+ break;
+ default:
+ LOG.debug("Unexpected node event: " + eventType + " for path: " + path);
+ monitorActiveStatus();
+ }
+
+ return;
+ }
+
+ // some unexpected error has occurred
+ fatalError("Unexpected watch error from Zookeeper");
+ }
+
+ /**
+   * Get a new zookeeper client instance. Protected so that a test class can
+   * inherit and pass in a mock object for zookeeper.
+ *
+ * @return new zookeeper client instance
+ * @throws IOException
+ */
+ protected synchronized ZooKeeper getNewZooKeeper() throws IOException {
+ return new ZooKeeper(zkHostPort, zkSessionTimeout, this);
+ }
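+
+  /*
+   * Test sketch (illustrative only; MockElector is hypothetical): a test can
+   * subclass the elector and hand back a mock ZooKeeper client. The mock is
+   * kept in a static field because this method is already called from the
+   * superclass constructor, before subclass instance fields are initialized.
+   *
+   *   class MockElector extends ActiveStandbyElector {
+   *     static ZooKeeper mockZk;  // set by the test before construction
+   *     MockElector(ActiveStandbyElectorCallback cb) throws IOException {
+   *       super("hostPort", 1000, "/testParent",
+   *           ZooDefs.Ids.OPEN_ACL_UNSAFE, cb);
+   *     }
+   *     protected ZooKeeper getNewZooKeeper() {
+   *       return mockZk;
+   *     }
+   *   }
+   */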
+
+ private void fatalError(String errorMessage) {
+ reset();
+ appClient.notifyFatalError(errorMessage);
+ }
+
+ private void monitorActiveStatus() {
+ LOG.debug("Monitoring active leader");
+ statRetryCount = 0;
+ monitorNode();
+ }
+
+ private void joinElectionInternal() {
+ if (zkClient == null) {
+ if (!reEstablishSession()) {
+ fatalError("Failed to reEstablish connection with ZooKeeper");
+ return;
+ }
+ }
+
+ createRetryCount = 0;
+ createNode();
+ }
+
+ private void reJoinElection() {
+ LOG.debug("Trying to re-establish ZK session");
+ terminateConnection();
+ joinElectionInternal();
+ }
+
+ private boolean reEstablishSession() {
+ int connectionRetryCount = 0;
+ boolean success = false;
+ while(!success && connectionRetryCount < NUM_RETRIES) {
+ LOG.debug("Establishing zookeeper connection");
+ try {
+ createConnection();
+ success = true;
+ } catch(IOException e) {
+ LOG.warn(e);
+ try {
+ Thread.sleep(5000);
+ } catch(InterruptedException e1) {
+ LOG.warn(e1);
+ }
+ }
+ ++connectionRetryCount;
+ }
+ return success;
+ }
+
+ private void createConnection() throws IOException {
+ zkClient = getNewZooKeeper();
+ }
+
+ private void terminateConnection() {
+ if (zkClient == null) {
+ return;
+ }
+ LOG.debug("Terminating ZK connection");
+ ZooKeeper tempZk = zkClient;
+ zkClient = null;
+ try {
+ tempZk.close();
+ } catch(InterruptedException e) {
+ LOG.warn(e);
+ }
+ zkConnectionState = ConnectionState.TERMINATED;
+ }
+
+ private void reset() {
+ state = State.INIT;
+ terminateConnection();
+ }
+
+ private void becomeActive() {
+ if (state != State.ACTIVE) {
+ LOG.debug("Becoming active");
+ state = State.ACTIVE;
+ appClient.becomeActive();
+ }
+ }
+
+ private void becomeStandby() {
+ if (state != State.STANDBY) {
+ LOG.debug("Becoming standby");
+ state = State.STANDBY;
+ appClient.becomeStandby();
+ }
+ }
+
+ private void enterNeutralMode() {
+ if (state != State.NEUTRAL) {
+ LOG.debug("Entering neutral mode");
+ state = State.NEUTRAL;
+ appClient.enterNeutralMode();
+ }
+ }
+
+ private void createNode() {
+ zkClient.create(zkLockFilePath, appData, zkAcl, CreateMode.EPHEMERAL, this,
+ null);
+ }
+
+ private void monitorNode() {
+ zkClient.exists(zkLockFilePath, true, this, null);
+ }
+
+ private boolean operationSuccess(Code code) {
+ return (code == Code.OK);
+ }
+
+ private boolean operationNodeExists(Code code) {
+ return (code == Code.NODEEXISTS);
+ }
+
+ private boolean operationNodeDoesNotExist(Code code) {
+ return (code == Code.NONODE);
+ }
+
+ private boolean operationRetry(Code code) {
+ switch (code) {
+ case CONNECTIONLOSS:
+ case OPERATIONTIMEOUT:
+ return true;
+ }
+ return false;
+ }
+
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/BadFencingConfigurationException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/BadFencingConfigurationException.java
new file mode 100644
index 0000000..3d3b1ba
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/BadFencingConfigurationException.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+
+/**
+ * Indicates that the operator has specified an invalid configuration
+ * for fencing methods.
+ */
+class BadFencingConfigurationException extends IOException {
+ private static final long serialVersionUID = 1L;
+
+ public BadFencingConfigurationException(String msg) {
+ super(msg);
+ }
+
+ public BadFencingConfigurationException(String msg, Throwable cause) {
+ super(msg, cause);
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java
new file mode 100644
index 0000000..0960fb7
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java
@@ -0,0 +1,184 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * The FailoverController is responsible for electing an active service
+ * on startup or when the current active is changing (e.g. due to failure),
+ * monitoring the health of a service, and performing a failover when a
+ * new active service is either manually selected by a user or elected.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class FailoverController {
+
+ private static final Log LOG = LogFactory.getLog(FailoverController.class);
+
+ /**
+   * Perform pre-failover checks on the given service we plan to
+   * fail over to, e.g. to prevent failing over to a service that is
+   * inaccessible, already active, or not healthy.
+   *
+   * An option to ignore toSvc if it claims it is not ready to
+   * become active is provided in case performing a failover will
+   * allow it to become active, e.g. because it triggers a log roll
+   * so the standby can learn about new blocks and leave safemode.
+   *
+   * @param toSvc service to make active
+   * @param toSvcAddr address of the service to make active
+ * @param forceActive ignore toSvc if it reports that it is not ready
+ * @throws FailoverFailedException if we should avoid failover
+ */
+ private static void preFailoverChecks(HAServiceProtocol toSvc,
+ InetSocketAddress toSvcAddr,
+ boolean forceActive)
+ throws FailoverFailedException {
+ HAServiceState toSvcState;
+
+ try {
+ toSvcState = toSvc.getServiceState();
+ } catch (IOException e) {
+ String msg = "Unable to get service state for " + toSvcAddr;
+ LOG.error(msg, e);
+ throw new FailoverFailedException(msg, e);
+ }
+
+ if (!toSvcState.equals(HAServiceState.STANDBY)) {
+ throw new FailoverFailedException(
+ "Can't failover to an active service");
+ }
+
+ try {
+ HAServiceProtocolHelper.monitorHealth(toSvc);
+ } catch (HealthCheckFailedException hce) {
+ throw new FailoverFailedException(
+ "Can't failover to an unhealthy service", hce);
+ } catch (IOException e) {
+ throw new FailoverFailedException(
+ "Got an IO exception", e);
+ }
+
+ try {
+ if (!toSvc.readyToBecomeActive()) {
+ if (!forceActive) {
+ throw new FailoverFailedException(
+ toSvcAddr + " is not ready to become active");
+ }
+ }
+ } catch (IOException e) {
+ throw new FailoverFailedException(
+ "Got an IO exception", e);
+ }
+ }
+
+ /**
+   * Fail over from service 1 to service 2. If the failover fails,
+   * then try to fail back.
+ *
+ * @param fromSvc currently active service
+ * @param fromSvcAddr addr of the currently active service
+ * @param toSvc service to make active
+ * @param toSvcAddr addr of the service to make active
+ * @param fencer for fencing fromSvc
+ * @param forceFence to fence fromSvc even if not strictly necessary
+ * @param forceActive try to make toSvc active even if it is not ready
+ * @throws FailoverFailedException if the failover fails
+ */
+ public static void failover(HAServiceProtocol fromSvc,
+ InetSocketAddress fromSvcAddr,
+ HAServiceProtocol toSvc,
+ InetSocketAddress toSvcAddr,
+ NodeFencer fencer,
+ boolean forceFence,
+ boolean forceActive)
+ throws FailoverFailedException {
+ Preconditions.checkArgument(fencer != null, "failover requires a fencer");
+ preFailoverChecks(toSvc, toSvcAddr, forceActive);
+
+ // Try to make fromSvc standby
+ boolean tryFence = true;
+ try {
+ HAServiceProtocolHelper.transitionToStandby(fromSvc);
+ // We should try to fence if we failed or it was forced
+      tryFence = forceFence;
+ } catch (ServiceFailedException sfe) {
+ LOG.warn("Unable to make " + fromSvcAddr + " standby (" +
+ sfe.getMessage() + ")");
+ } catch (IOException ioe) {
+ LOG.warn("Unable to make " + fromSvcAddr +
+ " standby (unable to connect)", ioe);
+ }
+
+ // Fence fromSvc if it's required or forced by the user
+ if (tryFence) {
+ if (!fencer.fence(fromSvcAddr)) {
+ throw new FailoverFailedException("Unable to fence " +
+ fromSvcAddr + ". Fencing failed.");
+ }
+ }
+
+ // Try to make toSvc active
+ boolean failed = false;
+ Throwable cause = null;
+ try {
+ HAServiceProtocolHelper.transitionToActive(toSvc);
+ } catch (ServiceFailedException sfe) {
+ LOG.error("Unable to make " + toSvcAddr + " active (" +
+ sfe.getMessage() + "). Failing back.");
+ failed = true;
+ cause = sfe;
+ } catch (IOException ioe) {
+ LOG.error("Unable to make " + toSvcAddr +
+ " active (unable to connect). Failing back.", ioe);
+ failed = true;
+ cause = ioe;
+ }
+
+ // We failed to make toSvc active
+ if (failed) {
+ String msg = "Unable to failover to " + toSvcAddr;
+ // Only try to failback if we didn't fence fromSvc
+ if (!tryFence) {
+ try {
+ // Unconditionally fence toSvc in case it is still trying to
+          // become active, e.g. we timed out waiting for its response.
+ // Unconditionally force fromSvc to become active since it
+ // was previously active when we initiated failover.
+ failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true, true);
+ } catch (FailoverFailedException ffe) {
+ msg += ". Failback to " + fromSvcAddr +
+ " failed (" + ffe.getMessage() + ")";
+ LOG.fatal(msg);
+ }
+ }
+ throw new FailoverFailedException(msg, cause);
+ }
+ }
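+
+  /*
+   * Invocation sketch (illustrative only; the proxies, addresses and
+   * configuration below are hypothetical):
+   *
+   *   NodeFencer fencer = NodeFencer.create(conf);  // fencing methods come from conf
+   *   try {
+   *     FailoverController.failover(
+   *         activeProxy,  new InetSocketAddress("nn1.example.com", 8020),
+   *         standbyProxy, new InetSocketAddress("nn2.example.com", 8020),
+   *         fencer,
+   *         false,   // forceFence: fence the old active even if it steps down cleanly
+   *         false);  // forceActive: promote the target even if it says it is not ready
+   *   } catch (FailoverFailedException ffe) {
+   *     // the failover (and possibly the attempted failback) did not succeed
+   *   }
+   *
+   * Handling of BadFencingConfigurationException from NodeFencer.create() and
+   * the null-fencer check are omitted; see HAAdmin#failover for a full example.
+   */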
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverFailedException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverFailedException.java
new file mode 100644
index 0000000..09982b4
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverFailedException.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * Exception thrown to indicate service failover has failed.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public class FailoverFailedException extends Exception {
+ private static final long serialVersionUID = 1L;
+
+ public FailoverFailedException(final String message) {
+ super(message);
+ }
+
+ public FailoverFailedException(String message, Throwable cause) {
+ super(message, cause);
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FenceMethod.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FenceMethod.java
new file mode 100644
index 0000000..d8bda14
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FenceMethod.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.net.InetSocketAddress;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * A fencing method is a method by which one node can forcibly prevent
+ * another node from making continued progress. This might be implemented
+ * by killing a process on the other node, by denying the other node's
+ * access to shared storage, or by accessing a PDU to cut the other node's
+ * power.
+ * <p>
+ * Since these methods are often vendor- or device-specific, operators
+ * may implement this interface in order to achieve fencing.
+ * <p>
+ * Fencing is configured by the operator as an ordered list of methods to
+ * attempt. Each method will be tried in turn, and the next in the list
+ * will only be attempted if the previous one fails. See {@link NodeFencer}
+ * for more information.
+ * <p>
+ * If an implementation also implements {@link Configurable} then its
+ * <code>setConf</code> method will be called upon instantiation.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
+public interface FenceMethod {
+ /**
+ * Verify that the given fencing method's arguments are valid.
+ * @param args the arguments provided in the configuration. This may
+ * be null if the operator did not configure any arguments.
+ * @throws BadFencingConfigurationException if the arguments are invalid
+ */
+ public void checkArgs(String args) throws BadFencingConfigurationException;
+
+ /**
+ * Attempt to fence the target node.
+ * @param serviceAddr the address (host:ipcport) of the service to fence
+ * @param args the configured arguments, which were checked at startup by
+ * {@link #checkArgs(String)}
+ * @return true if fencing was successful, false if unsuccessful or
+ * indeterminate
+ * @throws BadFencingConfigurationException if the configuration was
+ * determined to be invalid only at runtime
+ */
+ public boolean tryFence(InetSocketAddress serviceAddr, String args)
+ throws BadFencingConfigurationException;
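+
+  /*
+   * Implementation sketch (illustrative only; PduClient is a hypothetical
+   * vendor API). An operator-supplied method might cut power to the node
+   * hosting the service to be fenced:
+   *
+   *   public class PowerCycleFencer implements FenceMethod {
+   *     public void checkArgs(String args) throws BadFencingConfigurationException {
+   *       if (args == null || args.isEmpty()) {
+   *         throw new BadFencingConfigurationException("PDU address required");
+   *       }
+   *     }
+   *     public boolean tryFence(InetSocketAddress serviceAddr, String args) {
+   *       // contact the PDU named in args and power off the target host;
+   *       // return true only if the power-off is positively confirmed
+   *       return PduClient.powerOff(args, serviceAddr.getHostName());
+   *     }
+   *   }
+   */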
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java
new file mode 100644
index 0000000..3350692
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java
@@ -0,0 +1,321 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.net.InetSocketAddress;
+import java.util.Map;
+
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.ParseException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.ha.protocolPB.HAServiceProtocolClientSideTranslatorPB;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import com.google.common.collect.ImmutableMap;
+
+/**
+ * A command-line tool for making calls to the HAServiceProtocol.
+ * For example, this can be used to force a service into standby or active
+ * mode, or to trigger a health check.
+ */
+@InterfaceAudience.Private
+
+public abstract class HAAdmin extends Configured implements Tool {
+
+ private static final String FORCEFENCE = "forcefence";
+ private static final String FORCEACTIVE = "forceactive";
+
+ private static Map<String, UsageInfo> USAGE =
+ ImmutableMap.<String, UsageInfo>builder()
+ .put("-transitionToActive",
+ new UsageInfo("<serviceId>", "Transitions the service into Active state"))
+ .put("-transitionToStandby",
+ new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
+ .put("-failover",
+ new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
+ "Failover from the first service to the second.\n" +
+ "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
+ "Try to failover to the target service even if it is not ready if the " +
+ FORCEACTIVE + " option is used."))
+ .put("-getServiceState",
+ new UsageInfo("<serviceId>", "Returns the state of the service"))
+ .put("-checkHealth",
+ new UsageInfo("<serviceId>",
+ "Requests that the service perform a health check.\n" +
+ "The HAAdmin tool will exit with a non-zero exit code\n" +
+ "if the check fails."))
+ .put("-help",
+ new UsageInfo("<command>", "Displays help on the specified command"))
+ .build();
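+
+  /*
+   * Example invocations (illustrative; the command name and the service ids
+   * depend on the subclass that wires this tool into a script):
+   *
+   *   haadmin -transitionToActive serviceA
+   *   haadmin -failover --forcefence serviceA serviceB
+   *   haadmin -getServiceState serviceB
+   */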
+
+ /** Output stream for errors, for use in tests */
+ protected PrintStream errOut = System.err;
+ PrintStream out = System.out;
+
+ protected String getUsageString() {
+ return "Usage: HAAdmin";
+ }
+
+ protected void printUsage(PrintStream errOut) {
+ errOut.println(getUsageString());
+ for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
+ String cmd = e.getKey();
+ UsageInfo usage = e.getValue();
+
+ errOut.println(" [" + cmd + " " + usage.args + "]");
+ }
+ errOut.println();
+ ToolRunner.printGenericCommandUsage(errOut);
+ }
+
+ private static void printUsage(PrintStream errOut, String cmd) {
+ UsageInfo usage = USAGE.get(cmd);
+ if (usage == null) {
+ throw new RuntimeException("No usage for cmd " + cmd);
+ }
+ errOut.println("Usage: HAAdmin [" + cmd + " " + usage.args + "]");
+ }
+
+ private int transitionToActive(final String[] argv)
+ throws IOException, ServiceFailedException {
+ if (argv.length != 2) {
+ errOut.println("transitionToActive: incorrect number of arguments");
+ printUsage(errOut, "-transitionToActive");
+ return -1;
+ }
+
+ HAServiceProtocol proto = getProtocol(argv[1]);
+ HAServiceProtocolHelper.transitionToActive(proto);
+ return 0;
+ }
+
+ private int transitionToStandby(final String[] argv)
+ throws IOException, ServiceFailedException {
+ if (argv.length != 2) {
+ errOut.println("transitionToStandby: incorrect number of arguments");
+ printUsage(errOut, "-transitionToStandby");
+ return -1;
+ }
+
+ HAServiceProtocol proto = getProtocol(argv[1]);
+ HAServiceProtocolHelper.transitionToStandby(proto);
+ return 0;
+ }
+
+ private int failover(final String[] argv)
+ throws IOException, ServiceFailedException {
+ Configuration conf = getConf();
+ boolean forceFence = false;
+ boolean forceActive = false;
+
+ Options failoverOpts = new Options();
+ // "-failover" isn't really an option but we need to add
+ // it to appease CommandLineParser
+ failoverOpts.addOption("failover", false, "failover");
+ failoverOpts.addOption(FORCEFENCE, false, "force fencing");
+ failoverOpts.addOption(FORCEACTIVE, false, "force failover");
+
+ CommandLineParser parser = new GnuParser();
+ CommandLine cmd;
+
+ try {
+ cmd = parser.parse(failoverOpts, argv);
+ forceFence = cmd.hasOption(FORCEFENCE);
+ forceActive = cmd.hasOption(FORCEACTIVE);
+ } catch (ParseException pe) {
+ errOut.println("failover: incorrect arguments");
+ printUsage(errOut, "-failover");
+ return -1;
+ }
+
+ int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
+ final String[] args = cmd.getArgs();
+
+ if (numOpts > 2 || args.length != 2) {
+ errOut.println("failover: incorrect arguments");
+ printUsage(errOut, "-failover");
+ return -1;
+ }
+
+ NodeFencer fencer;
+ try {
+ fencer = NodeFencer.create(conf);
+ } catch (BadFencingConfigurationException bfce) {
+ errOut.println("failover: incorrect fencing configuration: " +
+ bfce.getLocalizedMessage());
+ return -1;
+ }
+ if (fencer == null) {
+ errOut.println("failover: no fencer configured");
+ return -1;
+ }
+
+ InetSocketAddress addr1 =
+ NetUtils.createSocketAddr(getServiceAddr(args[0]));
+ InetSocketAddress addr2 =
+ NetUtils.createSocketAddr(getServiceAddr(args[1]));
+ HAServiceProtocol proto1 = getProtocol(args[0]);
+ HAServiceProtocol proto2 = getProtocol(args[1]);
+
+ try {
+ FailoverController.failover(proto1, addr1, proto2, addr2,
+ fencer, forceFence, forceActive);
+ out.println("Failover from "+args[0]+" to "+args[1]+" successful");
+ } catch (FailoverFailedException ffe) {
+ errOut.println("Failover failed: " + ffe.getLocalizedMessage());
+ return -1;
+ }
+ return 0;
+ }
+
+ private int checkHealth(final String[] argv)
+ throws IOException, ServiceFailedException {
+ if (argv.length != 2) {
+ errOut.println("checkHealth: incorrect number of arguments");
+ printUsage(errOut, "-checkHealth");
+ return -1;
+ }
+
+ HAServiceProtocol proto = getProtocol(argv[1]);
+ try {
+ HAServiceProtocolHelper.monitorHealth(proto);
+ } catch (HealthCheckFailedException e) {
+ errOut.println("Health check failed: " + e.getLocalizedMessage());
+ return -1;
+ }
+ return 0;
+ }
+
+ private int getServiceState(final String[] argv)
+ throws IOException, ServiceFailedException {
+ if (argv.length != 2) {
+ errOut.println("getServiceState: incorrect number of arguments");
+ printUsage(errOut, "-getServiceState");
+ return -1;
+ }
+
+ HAServiceProtocol proto = getProtocol(argv[1]);
+ out.println(proto.getServiceState());
+ return 0;
+ }
+
+ /**
+   * Return the serviceId as-is; we assume it was given as a
+   * service address of the form host:ipcport.
+ */
+ protected String getServiceAddr(String serviceId) {
+ return serviceId;
+ }
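+
+  /*
+   * Subclass sketch (illustrative only; resolveAddressFromConf is
+   * hypothetical): a concrete admin tool can map a logical service id to a
+   * host:ipcport address taken from its own configuration, e.g.
+   *
+   *   @Override
+   *   protected String getServiceAddr(String serviceId) {
+   *     return resolveAddressFromConf(getConf(), serviceId);
+   *   }
+   */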
+
+ /**
+ * Return a proxy to the specified target service.
+ */
+ protected HAServiceProtocol getProtocol(String serviceId)
+ throws IOException {
+ String serviceAddr = getServiceAddr(serviceId);
+ InetSocketAddress addr = NetUtils.createSocketAddr(serviceAddr);
+ return new HAServiceProtocolClientSideTranslatorPB(addr, getConf());
+ }
+
+ @Override
+ public int run(String[] argv) throws Exception {
+ try {
+ return runCmd(argv);
+ } catch (IllegalArgumentException iae) {
+ errOut.println("Illegal argument: " + iae.getLocalizedMessage());
+ return -1;
+ } catch (IOException ioe) {
+ errOut.println("Operation failed: " + ioe.getLocalizedMessage());
+ return -1;
+ }
+ }
+
+ protected int runCmd(String[] argv) throws Exception {
+ if (argv.length < 1) {
+ printUsage(errOut);
+ return -1;
+ }
+
+ String cmd = argv[0];
+
+ if (!cmd.startsWith("-")) {
+ errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
+ printUsage(errOut);
+ return -1;
+ }
+
+ if ("-transitionToActive".equals(cmd)) {
+ return transitionToActive(argv);
+ } else if ("-transitionToStandby".equals(cmd)) {
+ return transitionToStandby(argv);
+ } else if ("-failover".equals(cmd)) {
+ return failover(argv);
+ } else if ("-getServiceState".equals(cmd)) {
+ return getServiceState(argv);
+ } else if ("-checkHealth".equals(cmd)) {
+ return checkHealth(argv);
+ } else if ("-help".equals(cmd)) {
+ return help(argv);
+ } else {
+ errOut.println(cmd.substring(1) + ": Unknown command");
+ printUsage(errOut);
+ return -1;
+ }
+ }
+
+ private int help(String[] argv) {
+ if (argv.length != 2) {
+ printUsage(errOut, "-help");
+ return -1;
+ }
+ String cmd = argv[1];
+ if (!cmd.startsWith("-")) {
+ cmd = "-" + cmd;
+ }
+ UsageInfo usageInfo = USAGE.get(cmd);
+ if (usageInfo == null) {
+ errOut.println(cmd + ": Unknown command");
+ printUsage(errOut);
+ return -1;
+ }
+
+ errOut.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
+ return 0;
+ }
+
+ private static class UsageInfo {
+ private final String args;
+ private final String help;
+
+ public UsageInfo(String args, String help) {
+ this.args = args;
+ this.help = help;
+ }
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java
new file mode 100644
index 0000000..18b10f9
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.security.KerberosInfo;
+
+import java.io.IOException;
+
+/**
+ * Protocol interface that provides High Availability related primitives to
+ * monitor and fail over the service.
+ *
+ * This interface could be used by HA frameworks to manage the service.
+ */
+@KerberosInfo(
+ serverPrincipal=CommonConfigurationKeys.HADOOP_SECURITY_SERVICE_USER_NAME_KEY)
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public interface HAServiceProtocol {
+ /**
+ * Initial version of the protocol
+ */
+ public static final long versionID = 1L;
+
+ /**
+ * An HA service may be in active or standby state. During
+ * startup, it is in an unknown INITIALIZING state.
+ */
+ public enum HAServiceState {
+ INITIALIZING("initializing"),
+ ACTIVE("active"),
+ STANDBY("standby");
+
+ private String name;
+
+ HAServiceState(String name) {
+ this.name = name;
+ }
+
+ public String toString() {
+ return name;
+ }
+ }
+
+ /**
+   * Monitor the health of the service. This is called periodically by the HA
+   * frameworks to monitor the health of the service.
+   *
+   * The service is expected to perform checks to ensure it is functional.
+   * If the service is not healthy due to failure or partial failure,
+   * it is expected to throw a {@link HealthCheckFailedException}.
+   * The definition of an unhealthy service is left to the service itself.
+   *
+   * Note that when the health check of an active service fails,
+   * a failover to standby may be performed.
+ *
+ * @throws HealthCheckFailedException
+ * if the health check of a service fails.
+ * @throws AccessControlException
+ * if access is denied.
+ * @throws IOException
+ * if other errors happen
+ */
+ public void monitorHealth() throws HealthCheckFailedException,
+ AccessControlException,
+ IOException;
+
+ /**
+   * Request the service to transition to the active state. This is a no-op
+   * if the service is already in the active state.
+ *
+ * @throws ServiceFailedException
+ * if transition from standby to active fails.
+ * @throws AccessControlException
+ * if access is denied.
+ * @throws IOException
+ * if other errors happen
+ */
+ public void transitionToActive() throws ServiceFailedException,
+ AccessControlException,
+ IOException;
+
+ /**
+   * Request the service to transition to the standby state. This is a no-op
+   * if the service is already in the standby state.
+ *
+ * @throws ServiceFailedException
+ * if transition from active to standby fails.
+ * @throws AccessControlException
+ * if access is denied.
+ * @throws IOException
+ * if other errors happen
+ */
+ public void transitionToStandby() throws ServiceFailedException,
+ AccessControlException,
+ IOException;
+
+ /**
+ * Return the current state of the service.
+ *
+ * @throws AccessControlException
+ * if access is denied.
+ * @throws IOException
+ * if other errors happen
+ */
+ public HAServiceState getServiceState() throws AccessControlException,
+ IOException;
+
+ /**
+ * Return true if the service is capable and ready to transition
+ * from the standby state to the active state.
+ *
+ * @return true if the service is ready to become active, false otherwise.
+ * @throws AccessControlException
+ * if access is denied.
+ * @throws IOException
+ * if other errors happen
+ */
+ public boolean readyToBecomeActive() throws ServiceFailedException,
+ AccessControlException,
+ IOException;
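+
+  /*
+   * Monitoring sketch (illustrative only; the failover hook and scheduling
+   * are hypothetical): an HA framework might drive this protocol roughly as
+   * follows, on a fixed interval:
+   *
+   *   try {
+   *     activeService.monitorHealth();
+   *   } catch (HealthCheckFailedException e) {
+   *     // the active is unhealthy; consider failing over to the standby
+   *     initiateFailoverToStandby();
+   *   } catch (IOException e) {
+   *     // the service could not be reached; this may also warrant a failover
+   *   }
+   */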
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocolHelper.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocolHelper.java
new file mode 100644
index 0000000..b8ee717
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocolHelper.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.ipc.RemoteException;
+
+/**
+ * Helper for making {@link HAServiceProtocol} RPC calls. This helper
+ * unwraps the {@link RemoteException} into the specific exception it wraps.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public class HAServiceProtocolHelper {
+ public static void monitorHealth(HAServiceProtocol svc)
+ throws IOException {
+ try {
+ svc.monitorHealth();
+ } catch (RemoteException e) {
+ throw e.unwrapRemoteException(HealthCheckFailedException.class);
+ }
+ }
+
+ public static void transitionToActive(HAServiceProtocol svc)
+ throws IOException {
+ try {
+ svc.transitionToActive();
+ } catch (RemoteException e) {
+ throw e.unwrapRemoteException(ServiceFailedException.class);
+ }
+ }
+
+ public static void transitionToStandby(HAServiceProtocol svc)
+ throws IOException {
+ try {
+ svc.transitionToStandby();
+ } catch (RemoteException e) {
+ throw e.unwrapRemoteException(ServiceFailedException.class);
+ }
+ }
+}
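
A usage sketch for the helper above (illustrative only; the host, port, and
configuration are placeholders, and the proxy comes from the PB client
translator added later in this change):

    import java.net.InetSocketAddress;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.ha.HAServiceProtocol;
    import org.apache.hadoop.ha.HAServiceProtocolHelper;
    import org.apache.hadoop.ha.protocolPB.HAServiceProtocolClientSideTranslatorPB;

    public class HaHelperSketch {
      public static void main(String[] args) throws Exception {
        // Placeholder address; in practice this comes from the HA configuration.
        HAServiceProtocol svc = new HAServiceProtocolClientSideTranslatorPB(
            new InetSocketAddress("nn1.example.com", 8020), new Configuration());
        // The helper unwraps RemoteException so callers catch the specific
        // exception types (HealthCheckFailedException, ServiceFailedException).
        HAServiceProtocolHelper.monitorHealth(svc);
        HAServiceProtocolHelper.transitionToActive(svc);
      }
    }
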
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HealthCheckFailedException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HealthCheckFailedException.java
new file mode 100644
index 0000000..e636adf
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HealthCheckFailedException.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * Exception thrown to indicate that health check of a service failed.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public class HealthCheckFailedException extends IOException {
+ private static final long serialVersionUID = 1L;
+
+ public HealthCheckFailedException(final String message) {
+ super(message);
+ }
+
+ public HealthCheckFailedException(String message, Throwable cause) {
+ super(message, cause);
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java
new file mode 100644
index 0000000..34a2c8b
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java
@@ -0,0 +1,195 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.net.InetSocketAddress;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+
+/**
+ * This class parses the configured list of fencing methods, and
+ * is responsible for trying each one in turn while logging informative
+ * output.<p>
+ *
+ * The fencing methods are configured as a newline-separated list.
+ * Each line in the list is of the form:<p>
+ * <code>com.example.foo.MyMethod(arg string)</code>
+ * or
+ * <code>com.example.foo.MyMethod</code>
+ * The class provided must implement the {@link FenceMethod} interface.
+ * The fencing methods that ship with Hadoop may also be referred to
+ * by shortened names:<p>
+ * <ul>
+ * <li><code>shell(/path/to/some/script.sh args...)</code></li>
+ * <li><code>sshfence(...)</code> (see {@link SshFenceByTcpPort})</li>
+ * </ul>
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class NodeFencer {
+ public static final String CONF_METHODS_KEY =
+ "dfs.ha.fencing.methods";
+
+ private static final String CLASS_RE = "([a-zA-Z0-9\\.\\$]+)";
+ private static final Pattern CLASS_WITH_ARGUMENT =
+ Pattern.compile(CLASS_RE + "\\((.+?)\\)");
+ private static final Pattern CLASS_WITHOUT_ARGUMENT =
+ Pattern.compile(CLASS_RE);
+ private static final Pattern HASH_COMMENT_RE =
+ Pattern.compile("#.*$");
+
+ private static final Log LOG = LogFactory.getLog(NodeFencer.class);
+
+ /**
+ * Standard fencing methods included with Hadoop.
+ */
+ private static final Map<String, Class<? extends FenceMethod>> STANDARD_METHODS =
+ ImmutableMap.<String, Class<? extends FenceMethod>>of(
+ "shell", ShellCommandFencer.class,
+ "sshfence", SshFenceByTcpPort.class);
+
+ private final List<FenceMethodWithArg> methods;
+
+ public NodeFencer(Configuration conf)
+ throws BadFencingConfigurationException {
+ this.methods = parseMethods(conf);
+ }
+
+ public static NodeFencer create(Configuration conf)
+ throws BadFencingConfigurationException {
+ String confStr = conf.get(CONF_METHODS_KEY);
+ if (confStr == null) {
+ return null;
+ }
+ return new NodeFencer(conf);
+ }
+
+ public boolean fence(InetSocketAddress serviceAddr) {
+ LOG.info("====== Beginning Service Fencing Process... ======");
+ int i = 0;
+ for (FenceMethodWithArg method : methods) {
+ LOG.info("Trying method " + (++i) + "/" + methods.size() +": " + method);
+
+ try {
+ if (method.method.tryFence(serviceAddr, method.arg)) {
+ LOG.info("====== Fencing successful by method " + method + " ======");
+ return true;
+ }
+ } catch (BadFencingConfigurationException e) {
+ LOG.error("Fencing method " + method + " misconfigured", e);
+ continue;
+ } catch (Throwable t) {
+ LOG.error("Fencing method " + method + " failed with an unexpected error.", t);
+ continue;
+ }
+ LOG.warn("Fencing method " + method + " was unsuccessful.");
+ }
+
+ LOG.error("Unable to fence service by any configured method.");
+ return false;
+ }
+
+ private static List<FenceMethodWithArg> parseMethods(Configuration conf)
+ throws BadFencingConfigurationException {
+ String confStr = conf.get(CONF_METHODS_KEY);
+ String[] lines = confStr.split("\\s*\n\\s*");
+
+ List<FenceMethodWithArg> methods = Lists.newArrayList();
+ for (String line : lines) {
+ line = HASH_COMMENT_RE.matcher(line).replaceAll("");
+ line = line.trim();
+ if (!line.isEmpty()) {
+ methods.add(parseMethod(conf, line));
+ }
+ }
+
+ return methods;
+ }
+
+ private static FenceMethodWithArg parseMethod(Configuration conf, String line)
+ throws BadFencingConfigurationException {
+ Matcher m;
+ if ((m = CLASS_WITH_ARGUMENT.matcher(line)).matches()) {
+ String className = m.group(1);
+ String arg = m.group(2);
+ return createFenceMethod(conf, className, arg);
+ } else if ((m = CLASS_WITHOUT_ARGUMENT.matcher(line)).matches()) {
+ String className = m.group(1);
+ return createFenceMethod(conf, className, null);
+ } else {
+ throw new BadFencingConfigurationException(
+ "Unable to parse line: '" + line + "'");
+ }
+ }
+
+ private static FenceMethodWithArg createFenceMethod(
+ Configuration conf, String clazzName, String arg)
+ throws BadFencingConfigurationException {
+
+ Class<?> clazz;
+ try {
+ // See if it's a short name for one of the built-in methods
+ clazz = STANDARD_METHODS.get(clazzName);
+ if (clazz == null) {
+ // Try to instantiate the user's custom method
+ clazz = Class.forName(clazzName);
+ }
+ } catch (Exception e) {
+ throw new BadFencingConfigurationException(
+ "Could not find configured fencing method " + clazzName,
+ e);
+ }
+
+ // Check that it implements the right interface
+ if (!FenceMethod.class.isAssignableFrom(clazz)) {
+ throw new BadFencingConfigurationException("Class " + clazzName +
+ " does not implement FenceMethod");
+ }
+
+ FenceMethod method = (FenceMethod)ReflectionUtils.newInstance(
+ clazz, conf);
+ method.checkArgs(arg);
+ return new FenceMethodWithArg(method, arg);
+ }
+
+ private static class FenceMethodWithArg {
+ private final FenceMethod method;
+ private final String arg;
+
+ private FenceMethodWithArg(FenceMethod method, String arg) {
+ this.method = method;
+ this.arg = arg;
+ }
+
+ public String toString() {
+ return method.getClass().getCanonicalName() + "(" + arg + ")";
+ }
+ }
+}
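
A sketch of configuring and exercising the newline-separated method list
described above (illustrative only; NodeFencer is marked
@InterfaceAudience.Private, and the host, port, and script path are
placeholders):

    import java.net.InetSocketAddress;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.ha.NodeFencer;

    public class FencingConfigSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Two methods, tried in order: sshfence first, then a shell fallback.
        conf.set("dfs.ha.fencing.methods",
            "sshfence(hdfs:22)\n" +
            "shell(/path/to/my/fence-script.sh --force)");
        NodeFencer fencer = NodeFencer.create(conf);
        boolean ok = fencer.fence(new InetSocketAddress("nn1.example.com", 8020));
        System.exit(ok ? 0 : 1);
      }
    }
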
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ServiceFailedException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ServiceFailedException.java
new file mode 100644
index 0000000..6f3e444
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ServiceFailedException.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+
+/**
+ * Exception thrown to indicate that an operation performed
+ * to modify the state of a service or application failed.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public class ServiceFailedException extends IOException {
+ private static final long serialVersionUID = 1L;
+
+ public ServiceFailedException(final String message) {
+ super(message);
+ }
+
+ public ServiceFailedException(String message, Throwable cause) {
+ super(message, cause);
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java
new file mode 100644
index 0000000..ca81f23
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java
@@ -0,0 +1,187 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.net.InetSocketAddress;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.StringUtils;
+
+import com.google.common.annotations.VisibleForTesting;
+
+/**
+ * Fencing method that runs a shell command. It should be specified
+ * in the fencing configuration like:<br>
+ * <code>
+ * shell(/path/to/my/script.sh arg1 arg2 ...)
+ * </code><br>
+ * The string between '(' and ')' is passed directly to a bash shell and
+ * may not include any closing parentheses.<p>
+ *
+ * The shell command will be run with an environment set up to contain
+ * all of the current Hadoop configuration variables, with the '_' character
+ * replacing any '.' characters in the configuration keys.<p>
+ *
+ * If the shell command returns an exit code of 0, the fencing is
+ * determined to be successful. If it returns any other exit code, the
+ * fencing was not successful and the next fencing method in the list
+ * will be attempted.<p>
+ *
+ * <em>Note:</em> this fencing method does not implement any timeout.
+ * If timeouts are necessary, they should be implemented in the shell
+ * script itself (eg by forking a subshell to kill its parent in
+ * some number of seconds).
+ */
+public class ShellCommandFencer
+ extends Configured implements FenceMethod {
+
+ /** Length at which to abbreviate command in long messages */
+ private static final int ABBREV_LENGTH = 20;
+
+ @VisibleForTesting
+ static Log LOG = LogFactory.getLog(
+ ShellCommandFencer.class);
+
+ @Override
+ public void checkArgs(String args) throws BadFencingConfigurationException {
+ if (args == null || args.isEmpty()) {
+ throw new BadFencingConfigurationException(
+ "No argument passed to 'shell' fencing method");
+ }
+ // Nothing else we can really check without actually running the command
+ }
+
+ @Override
+ public boolean tryFence(InetSocketAddress serviceAddr, String cmd) {
+ List<String> cmdList = Arrays.asList(cmd.split("\\s+"));
+
+ // Create arg list with service as the first argument
+ List<String> argList = new ArrayList<String>();
+ argList.add(cmdList.get(0));
+ argList.add(serviceAddr.getHostName() + ":" + serviceAddr.getPort());
+ argList.addAll(cmdList.subList(1, cmdList.size()));
+ String cmdWithSvc = StringUtils.join(" ", argList);
+
+ ProcessBuilder builder = new ProcessBuilder(
+ "bash", "-e", "-c", cmdWithSvc);
+ setConfAsEnvVars(builder.environment());
+
+ Process p;
+ try {
+ p = builder.start();
+ p.getOutputStream().close();
+ } catch (IOException e) {
+ LOG.warn("Unable to execute " + cmd, e);
+ return false;
+ }
+
+ String pid = tryGetPid(p);
+ LOG.info("Launched fencing command '" + cmd + "' with "
+ + ((pid != null) ? ("pid " + pid) : "unknown pid"));
+
+ String logPrefix = abbreviate(cmd, ABBREV_LENGTH);
+ if (pid != null) {
+ logPrefix = "[PID " + pid + "] " + logPrefix;
+ }
+
+ // Pump logs to stderr
+ StreamPumper errPumper = new StreamPumper(
+ LOG, logPrefix, p.getErrorStream(),
+ StreamPumper.StreamType.STDERR);
+ errPumper.start();
+
+ StreamPumper outPumper = new StreamPumper(
+ LOG, logPrefix, p.getInputStream(),
+ StreamPumper.StreamType.STDOUT);
+ outPumper.start();
+
+ int rc;
+ try {
+ rc = p.waitFor();
+ errPumper.join();
+ outPumper.join();
+ } catch (InterruptedException ie) {
+ LOG.warn("Interrupted while waiting for fencing command: " + cmd);
+ return false;
+ }
+
+ return rc == 0;
+ }
+
+ /**
+ * Abbreviate a string by putting '...' in the middle of it,
+ * in an attempt to keep logs from getting too messy.
+ * @param cmd the string to abbreviate
+ * @param len maximum length to abbreviate to
+ * @return abbreviated string
+ */
+ static String abbreviate(String cmd, int len) {
+ if (cmd.length() > len && len >= 5) {
+ int firstHalf = (len - 3) / 2;
+ int rem = len - firstHalf - 3;
+
+ return cmd.substring(0, firstHalf) +
+ "..." + cmd.substring(cmd.length() - rem);
+ } else {
+ return cmd;
+ }
+ }
+
+ /**
+ * Attempt to use evil reflection tricks to determine the
+ * pid of a launched process. This is helpful to operators
+ * when debugging a fencing process that might have gone
+ * wrong. If running on a system or JVM where this doesn't
+ * work, it will simply return null.
+ */
+ private static String tryGetPid(Process p) {
+ try {
+ Class<? extends Process> clazz = p.getClass();
+ if (clazz.getName().equals("java.lang.UNIXProcess")) {
+ Field f = clazz.getDeclaredField("pid");
+ f.setAccessible(true);
+ return String.valueOf(f.getInt(p));
+ } else {
+ LOG.trace("Unable to determine pid for " + p
+ + " since it is not a UNIXProcess");
+ return null;
+ }
+ } catch (Throwable t) {
+ LOG.trace("Unable to determine pid for " + p, t);
+ return null;
+ }
+ }
+
+ /**
+ * Set the environment of the subprocess to be the Configuration,
+ * with '.'s replaced by '_'s.
+ */
+ private void setConfAsEnvVars(Map<String, String> env) {
+ for (Map.Entry<String, String> pair : getConf()) {
+ env.put(pair.getKey().replace('.', '_'), pair.getValue());
+ }
+ }
+}
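
A sketch of invoking the shell method directly, to make the argument handling
above concrete (illustrative only; the script path and address are
placeholders). The command is split on whitespace and the target "host:port"
is inserted as the first argument, so the call below actually runs
/path/to/my/fence-script.sh nn1.example.com:8020 --force:

    import java.net.InetSocketAddress;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.ha.ShellCommandFencer;

    public class ShellFenceSketch {
      public static void main(String[] args) {
        ShellCommandFencer fencer = new ShellCommandFencer();
        // The Configuration is exported to the script's environment,
        // with '.' replaced by '_' in the key names.
        fencer.setConf(new Configuration());
        boolean ok = fencer.tryFence(
            new InetSocketAddress("nn1.example.com", 8020),
            "/path/to/my/fence-script.sh --force");
        System.exit(ok ? 0 : 1);
      }
    }
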
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java
new file mode 100644
index 0000000..cec731c
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java
@@ -0,0 +1,315 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.Collection;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configured;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.jcraft.jsch.ChannelExec;
+import com.jcraft.jsch.JSch;
+import com.jcraft.jsch.JSchException;
+import com.jcraft.jsch.Session;
+
+/**
+ * This fencing implementation sshes to the target node and uses
+ * <code>fuser</code> to kill the process listening on the service's
+ * TCP port. This is more accurate than using "jps" since it doesn't
+ * require parsing, and will work even if there are multiple service
+ * processes running on the same machine.<p>
+ * It returns a successful status code if:
+ * <ul>
+ * <li><code>fuser</code> indicates it successfully killed a process, <em>or</em>
+ * <li><code>nc -z</code> indicates that nothing is listening on the target port
+ * </ul>
+ * <p>
+ * This fencing mechanism is configured as follows in the fencing method
+ * list:
+ * <code>sshfence([[username][:ssh-port]])</code>
+ * where the optional argument specifies the username and port to use
+ * with ssh.
+ * <p>
+ * In order to achieve passwordless SSH, the operator must also configure
+ * <code>dfs.ha.fencing.ssh.private-key-files</code> to point to an
+ * SSH key that has passphrase-less access to the given username and host.
+ */
+public class SshFenceByTcpPort extends Configured
+ implements FenceMethod {
+
+ static final Log LOG = LogFactory.getLog(
+ SshFenceByTcpPort.class);
+
+ static final String CONF_CONNECT_TIMEOUT_KEY =
+ "dfs.ha.fencing.ssh.connect-timeout";
+ private static final int CONF_CONNECT_TIMEOUT_DEFAULT =
+ 30*1000;
+ static final String CONF_IDENTITIES_KEY =
+ "dfs.ha.fencing.ssh.private-key-files";
+
+ /**
+ * Verify that the argument, if given in the conf, is parseable.
+ */
+ @Override
+ public void checkArgs(String argStr) throws BadFencingConfigurationException {
+ if (argStr != null) {
+ // Use a dummy service when checking the arguments defined
+ // in the configuration are parseable.
+ new Args(new InetSocketAddress("localhost", 8020), argStr);
+ }
+ }
+
+ @Override
+ public boolean tryFence(InetSocketAddress serviceAddr, String argsStr)
+ throws BadFencingConfigurationException {
+
+ Args args = new Args(serviceAddr, argsStr);
+
+ Session session;
+ try {
+ session = createSession(args);
+ } catch (JSchException e) {
+ LOG.warn("Unable to create SSH session", e);
+ return false;
+ }
+
+ LOG.info("Connecting to " + args.host + "...");
+
+ try {
+ session.connect(getSshConnectTimeout());
+ } catch (JSchException e) {
+ LOG.warn("Unable to connect to " + args.host
+ + " as user " + args.user, e);
+ return false;
+ }
+ LOG.info("Connected to " + args.host);
+
+ try {
+ return doFence(session, args.targetPort);
+ } catch (JSchException e) {
+ LOG.warn("Unable to achieve fencing on remote host", e);
+ return false;
+ } finally {
+ session.disconnect();
+ }
+ }
+
+
+ private Session createSession(Args args) throws JSchException {
+ JSch jsch = new JSch();
+ for (String keyFile : getKeyFiles()) {
+ jsch.addIdentity(keyFile);
+ }
+ JSch.setLogger(new LogAdapter());
+
+ Session session = jsch.getSession(args.user, args.host, args.sshPort);
+ session.setConfig("StrictHostKeyChecking", "no");
+ return session;
+ }
+
+ private boolean doFence(Session session, int port) throws JSchException {
+ try {
+ LOG.info("Looking for process running on port " + port);
+ int rc = execCommand(session,
+ "PATH=$PATH:/sbin:/usr/sbin fuser -v -k -n tcp " + port);
+ if (rc == 0) {
+ LOG.info("Successfully killed process that was " +
+ "listening on port " + port);
+ // exit code 0 indicates the process was successfully killed.
+ return true;
+ } else if (rc == 1) {
+ // exit code 1 indicates either that the process was not running
+ // or that fuser didn't have root privileges in order to find it
+ // (eg running as a different user)
+ LOG.info(
+ "Indeterminate response from trying to kill service. " +
+ "Verifying whether it is running using nc...");
+ rc = execCommand(session, "nc -z localhost " + port);
+ if (rc == 0) {
+ // the service is still listening - we are unable to fence
+ LOG.warn("Unable to fence - it is running but we cannot kill it");
+ return false;
+ } else {
+ LOG.info("Verified that the service is down.");
+ return true;
+ }
+ } else {
+ // Any other exit code: log it below and report the fence as failed.
+ }
+ LOG.info("rc: " + rc);
+ return rc == 0;
+ } catch (InterruptedException e) {
+ LOG.warn("Interrupted while trying to fence via ssh", e);
+ return false;
+ } catch (IOException e) {
+ LOG.warn("Unknown failure while trying to fence via ssh", e);
+ return false;
+ }
+ }
+
+ /**
+ * Execute a command through the ssh session, pumping its
+ * stderr and stdout to our own logs.
+ */
+ private int execCommand(Session session, String cmd)
+ throws JSchException, InterruptedException, IOException {
+ LOG.debug("Running cmd: " + cmd);
+ ChannelExec exec = null;
+ try {
+ exec = (ChannelExec)session.openChannel("exec");
+ exec.setCommand(cmd);
+ exec.setInputStream(null);
+ exec.connect();
+
+ // Pump stdout of the command to our WARN logs
+ StreamPumper outPumper = new StreamPumper(LOG, cmd + " via ssh",
+ exec.getInputStream(), StreamPumper.StreamType.STDOUT);
+ outPumper.start();
+
+ // Pump stderr of the command to our WARN logs
+ StreamPumper errPumper = new StreamPumper(LOG, cmd + " via ssh",
+ exec.getErrStream(), StreamPumper.StreamType.STDERR);
+ errPumper.start();
+
+ outPumper.join();
+ errPumper.join();
+ return exec.getExitStatus();
+ } finally {
+ cleanup(exec);
+ }
+ }
+
+ private static void cleanup(ChannelExec exec) {
+ if (exec != null) {
+ try {
+ exec.disconnect();
+ } catch (Throwable t) {
+ LOG.warn("Couldn't disconnect ssh channel", t);
+ }
+ }
+ }
+
+ private int getSshConnectTimeout() {
+ return getConf().getInt(
+ CONF_CONNECT_TIMEOUT_KEY, CONF_CONNECT_TIMEOUT_DEFAULT);
+ }
+
+ private Collection<String> getKeyFiles() {
+ return getConf().getTrimmedStringCollection(CONF_IDENTITIES_KEY);
+ }
+
+ /**
+ * Container for the parsed arg line for this fencing method.
+ */
+ @VisibleForTesting
+ static class Args {
+ private static final Pattern USER_PORT_RE = Pattern.compile(
+ "([^:]+?)?(?:\\:(\\d+))?");
+
+ private static final int DEFAULT_SSH_PORT = 22;
+
+ String host;
+ int targetPort;
+ String user;
+ int sshPort;
+
+ public Args(InetSocketAddress serviceAddr, String arg)
+ throws BadFencingConfigurationException {
+ host = serviceAddr.getHostName();
+ targetPort = serviceAddr.getPort();
+ user = System.getProperty("user.name");
+ sshPort = DEFAULT_SSH_PORT;
+
+ // Parse optional user and ssh port
+ if (arg != null && !"".equals(arg)) {
+ Matcher m = USER_PORT_RE.matcher(arg);
+ if (!m.matches()) {
+ throw new BadFencingConfigurationException(
+ "Unable to parse user and SSH port: "+ arg);
+ }
+ if (m.group(1) != null) {
+ user = m.group(1);
+ }
+ if (m.group(2) != null) {
+ sshPort = parseConfiggedPort(m.group(2));
+ }
+ }
+ }
+
+ private Integer parseConfiggedPort(String portStr)
+ throws BadFencingConfigurationException {
+ try {
+ return Integer.valueOf(portStr);
+ } catch (NumberFormatException nfe) {
+ throw new BadFencingConfigurationException(
+ "Port number '" + portStr + "' invalid");
+ }
+ }
+ }
+
+ /**
+ * Adapter from JSch's logger interface to our log4j
+ */
+ private static class LogAdapter implements com.jcraft.jsch.Logger {
+ static final Log LOG = LogFactory.getLog(
+ SshFenceByTcpPort.class.getName() + ".jsch");
+
+ public boolean isEnabled(int level) {
+ switch (level) {
+ case com.jcraft.jsch.Logger.DEBUG:
+ return LOG.isDebugEnabled();
+ case com.jcraft.jsch.Logger.INFO:
+ return LOG.isInfoEnabled();
+ case com.jcraft.jsch.Logger.WARN:
+ return LOG.isWarnEnabled();
+ case com.jcraft.jsch.Logger.ERROR:
+ return LOG.isErrorEnabled();
+ case com.jcraft.jsch.Logger.FATAL:
+ return LOG.isFatalEnabled();
+ default:
+ return false;
+ }
+ }
+
+ public void log(int level, String message) {
+ switch (level) {
+ case com.jcraft.jsch.Logger.DEBUG:
+ LOG.debug(message);
+ break;
+ case com.jcraft.jsch.Logger.INFO:
+ LOG.info(message);
+ break;
+ case com.jcraft.jsch.Logger.WARN:
+ LOG.warn(message);
+ break;
+ case com.jcraft.jsch.Logger.ERROR:
+ LOG.error(message);
+ break;
+ case com.jcraft.jsch.Logger.FATAL:
+ LOG.fatal(message);
+ break;
+ }
+ }
+ }
+}
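
A sketch of the configuration the sshfence method reads, using the key names
defined above (values are placeholders):

    import org.apache.hadoop.conf.Configuration;

    public class SshFenceConfSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Fence as user "hdfs" over SSH port 22; with no argument, the current
        // user name and the default port 22 are used.
        conf.set("dfs.ha.fencing.methods", "sshfence(hdfs:22)");
        // Passphrase-less private key(s) used for the SSH connection.
        conf.set("dfs.ha.fencing.ssh.private-key-files", "/home/hdfs/.ssh/id_rsa");
        // Optional SSH connect timeout in milliseconds (default 30000).
        conf.setInt("dfs.ha.fencing.ssh.connect-timeout", 10000);
      }
    }
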
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/StreamPumper.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/StreamPumper.java
new file mode 100644
index 0000000..8bc16af
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/StreamPumper.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+import org.apache.commons.logging.Log;
+
+/**
+ * Class responsible for pumping the streams of the subprocess
+ * out to log4j. stderr is pumped to WARN level and stdout is
+ * pumped to INFO level
+ */
+class StreamPumper {
+ enum StreamType {
+ STDOUT, STDERR;
+ }
+
+ private final Log log;
+
+ final Thread thread;
+ final String logPrefix;
+ final StreamPumper.StreamType type;
+ private final InputStream stream;
+ private boolean started = false;
+
+ StreamPumper(final Log log, final String logPrefix,
+ final InputStream stream, final StreamType type) {
+ this.log = log;
+ this.logPrefix = logPrefix;
+ this.stream = stream;
+ this.type = type;
+
+ thread = new Thread(new Runnable() {
+ @Override
+ public void run() {
+ try {
+ pump();
+ } catch (Throwable t) {
+ ShellCommandFencer.LOG.warn(logPrefix +
+ ": Unable to pump output from " + type,
+ t);
+ }
+ }
+ }, logPrefix + ": StreamPumper for " + type);
+ thread.setDaemon(true);
+ }
+
+ void join() throws InterruptedException {
+ assert started;
+ thread.join();
+ }
+
+ void start() {
+ assert !started;
+ thread.start();
+ started = true;
+ }
+
+ protected void pump() throws IOException {
+ InputStreamReader inputStreamReader = new InputStreamReader(stream);
+ BufferedReader br = new BufferedReader(inputStreamReader);
+ String line = null;
+ while ((line = br.readLine()) != null) {
+ if (type == StreamType.STDOUT) {
+ log.info(logPrefix + ": " + line);
+ } else {
+ log.warn(logPrefix + ": " + line);
+ }
+ }
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolClientSideTranslatorPB.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolClientSideTranslatorPB.java
new file mode 100644
index 0000000..3bf4f6f
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolClientSideTranslatorPB.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha.protocolPB;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ha.HAServiceProtocol;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.GetServiceStateRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceStateProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.MonitorHealthRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.ReadyToBecomeActiveRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToActiveRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToStandbyRequestProto;
+import org.apache.hadoop.ipc.ProtobufHelper;
+import org.apache.hadoop.ipc.ProtobufRpcEngine;
+import org.apache.hadoop.ipc.ProtocolSignature;
+import org.apache.hadoop.ipc.RPC;
+
+import com.google.protobuf.RpcController;
+import com.google.protobuf.ServiceException;
+
+/**
+ * This class is the client side translator to translate the requests made on
+ * {@link HAServiceProtocol} interfaces to the RPC server implementing
+ * {@link HAServiceProtocolPB}.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Stable
+public class HAServiceProtocolClientSideTranslatorPB implements
+ HAServiceProtocol, Closeable {
+ /** RpcController is not used and hence is set to null */
+ private final static RpcController NULL_CONTROLLER = null;
+ private final static MonitorHealthRequestProto MONITOR_HEALTH_REQ =
+ MonitorHealthRequestProto.newBuilder().build();
+ private final static TransitionToActiveRequestProto TRANSITION_TO_ACTIVE_REQ =
+ TransitionToActiveRequestProto.newBuilder().build();
+ private final static TransitionToStandbyRequestProto TRANSITION_TO_STANDBY_REQ =
+ TransitionToStandbyRequestProto.newBuilder().build();
+ private final static GetServiceStateRequestProto GET_SERVICE_STATE_REQ =
+ GetServiceStateRequestProto.newBuilder().build();
+ private final static ReadyToBecomeActiveRequestProto ACTIVE_READY_REQ =
+ ReadyToBecomeActiveRequestProto.newBuilder().build();
+
+ private final HAServiceProtocolPB rpcProxy;
+
+ public HAServiceProtocolClientSideTranslatorPB(InetSocketAddress addr,
+ Configuration conf) throws IOException {
+ RPC.setProtocolEngine(conf, HAServiceProtocolPB.class,
+ ProtobufRpcEngine.class);
+ rpcProxy = RPC.getProxy(HAServiceProtocolPB.class,
+ RPC.getProtocolVersion(HAServiceProtocolPB.class), addr, conf);
+ }
+
+ @Override
+ public void monitorHealth() throws IOException {
+ try {
+ rpcProxy.monitorHealth(NULL_CONTROLLER, MONITOR_HEALTH_REQ);
+ } catch (ServiceException e) {
+ throw ProtobufHelper.getRemoteException(e);
+ }
+ }
+
+ @Override
+ public void transitionToActive() throws IOException {
+ try {
+ rpcProxy.transitionToActive(NULL_CONTROLLER, TRANSITION_TO_ACTIVE_REQ);
+ } catch (ServiceException e) {
+ throw ProtobufHelper.getRemoteException(e);
+ }
+ }
+
+ @Override
+ public void transitionToStandby() throws IOException {
+ try {
+ rpcProxy.transitionToStandby(NULL_CONTROLLER, TRANSITION_TO_STANDBY_REQ);
+ } catch (ServiceException e) {
+ throw ProtobufHelper.getRemoteException(e);
+ }
+ }
+
+ @Override
+ public HAServiceState getServiceState() throws IOException {
+ HAServiceStateProto state;
+ try {
+ state = rpcProxy.getServiceState(NULL_CONTROLLER,
+ GET_SERVICE_STATE_REQ).getState();
+ } catch (ServiceException e) {
+ throw ProtobufHelper.getRemoteException(e);
+ }
+ switch(state) {
+ case ACTIVE:
+ return HAServiceState.ACTIVE;
+ case STANDBY:
+ return HAServiceState.STANDBY;
+ case INITIALIZING:
+ default:
+ return HAServiceState.INITIALIZING;
+ }
+ }
+
+ @Override
+ public void close() {
+ RPC.stopProxy(rpcProxy);
+ }
+
+ @Override
+ public boolean readyToBecomeActive() throws IOException {
+ try {
+ return rpcProxy.readyToBecomeActive(NULL_CONTROLLER, ACTIVE_READY_REQ)
+ .getReadyToBecomeActive();
+ } catch (ServiceException e) {
+ throw ProtobufHelper.getRemoteException(e);
+ }
+ }
+}
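
A sketch of querying HA state through the translator above (illustrative only;
the address is a placeholder). close() stops the underlying RPC proxy:

    import java.net.InetSocketAddress;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
    import org.apache.hadoop.ha.protocolPB.HAServiceProtocolClientSideTranslatorPB;

    public class HaStateSketch {
      public static void main(String[] args) throws Exception {
        HAServiceProtocolClientSideTranslatorPB proxy =
            new HAServiceProtocolClientSideTranslatorPB(
                new InetSocketAddress("nn1.example.com", 8020), new Configuration());
        try {
          HAServiceState state = proxy.getServiceState();
          boolean ready = proxy.readyToBecomeActive();
          System.out.println("state=" + state + ", readyToBecomeActive=" + ready);
        } finally {
          proxy.close();
        }
      }
    }
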
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolPB.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolPB.java
new file mode 100644
index 0000000..57eefce
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolPB.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha.protocolPB;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceProtocolService;
+import org.apache.hadoop.ipc.ProtocolInfo;
+import org.apache.hadoop.ipc.VersionedProtocol;
+import org.apache.hadoop.security.KerberosInfo;
+
+@KerberosInfo(
+ serverPrincipal=CommonConfigurationKeys.HADOOP_SECURITY_SERVICE_USER_NAME_KEY)
+@ProtocolInfo(protocolName = "org.apache.hadoop.ha.HAServiceProtocol",
+ protocolVersion = 1)
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public interface HAServiceProtocolPB extends
+ HAServiceProtocolService.BlockingInterface, VersionedProtocol {
+ /**
+ * If any methods need annotations, they can be added here
+ */
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolServerSideTranslatorPB.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolServerSideTranslatorPB.java
new file mode 100644
index 0000000..3655a4e
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/protocolPB/HAServiceProtocolServerSideTranslatorPB.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha.protocolPB;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.ha.HAServiceProtocol;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.GetServiceStateRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.GetServiceStateResponseProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceStateProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.MonitorHealthRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.MonitorHealthResponseProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.ReadyToBecomeActiveRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.ReadyToBecomeActiveResponseProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToActiveRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToActiveResponseProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToStandbyRequestProto;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.TransitionToStandbyResponseProto;
+import org.apache.hadoop.ipc.ProtocolSignature;
+import org.apache.hadoop.ipc.RPC;
+
+import com.google.protobuf.RpcController;
+import com.google.protobuf.ServiceException;
+
+/**
+ * This class is used on the server side. Calls come across the wire for the
+ * protocol {@link HAServiceProtocolPB}.
+ * This class translates the PB data types
+ * to the native data types used by the underlying
+ * {@link HAServiceProtocol} implementation.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Stable
+public class HAServiceProtocolServerSideTranslatorPB implements
+ HAServiceProtocolPB {
+ private final HAServiceProtocol server;
+ private static final MonitorHealthResponseProto MONITOR_HEALTH_RESP =
+ MonitorHealthResponseProto.newBuilder().build();
+ private static final TransitionToActiveResponseProto TRANSITION_TO_ACTIVE_RESP =
+ TransitionToActiveResponseProto.newBuilder().build();
+ private static final TransitionToStandbyResponseProto TRANSITION_TO_STANDBY_RESP =
+ TransitionToStandbyResponseProto.newBuilder().build();
+
+ public HAServiceProtocolServerSideTranslatorPB(HAServiceProtocol server) {
+ this.server = server;
+ }
+
+ @Override
+ public MonitorHealthResponseProto monitorHealth(RpcController controller,
+ MonitorHealthRequestProto request) throws ServiceException {
+ try {
+ server.monitorHealth();
+ return MONITOR_HEALTH_RESP;
+ } catch(IOException e) {
+ throw new ServiceException(e);
+ }
+ }
+
+ @Override
+ public TransitionToActiveResponseProto transitionToActive(
+ RpcController controller, TransitionToActiveRequestProto request)
+ throws ServiceException {
+ try {
+ server.transitionToActive();
+ return TRANSITION_TO_ACTIVE_RESP;
+ } catch(IOException e) {
+ throw new ServiceException(e);
+ }
+ }
+
+ @Override
+ public TransitionToStandbyResponseProto transitionToStandby(
+ RpcController controller, TransitionToStandbyRequestProto request)
+ throws ServiceException {
+ try {
+ server.transitionToStandby();
+ return TRANSITION_TO_STANDBY_RESP;
+ } catch(IOException e) {
+ throw new ServiceException(e);
+ }
+ }
+
+ @Override
+ public GetServiceStateResponseProto getServiceState(RpcController controller,
+ GetServiceStateRequestProto request) throws ServiceException {
+ HAServiceState s;
+ try {
+ s = server.getServiceState();
+ } catch(IOException e) {
+ throw new ServiceException(e);
+ }
+
+ HAServiceStateProto ret;
+ switch (s) {
+ case ACTIVE:
+ ret = HAServiceStateProto.ACTIVE;
+ break;
+ case STANDBY:
+ ret = HAServiceStateProto.STANDBY;
+ break;
+ case INITIALIZING:
+ default:
+ ret = HAServiceStateProto.INITIALIZING;
+ break;
+ }
+ return GetServiceStateResponseProto.newBuilder().setState(ret).build();
+ }
+
+ @Override
+ public long getProtocolVersion(String protocol, long clientVersion)
+ throws IOException {
+ return RPC.getProtocolVersion(HAServiceProtocolPB.class);
+ }
+
+ @Override
+ public ProtocolSignature getProtocolSignature(String protocol,
+ long clientVersion, int clientMethodsHash) throws IOException {
+ if (!protocol.equals(RPC.getProtocolName(HAServiceProtocolPB.class))) {
+ throw new IOException("Serverside implements " +
+ RPC.getProtocolName(HAServiceProtocolPB.class) +
+ ". The following requested protocol is unknown: " + protocol);
+ }
+
+ return ProtocolSignature.getProtocolSignature(clientMethodsHash,
+ RPC.getProtocolVersion(HAServiceProtocolPB.class),
+ HAServiceProtocolPB.class);
+ }
+
+ @Override
+ public ReadyToBecomeActiveResponseProto readyToBecomeActive(
+ RpcController controller, ReadyToBecomeActiveRequestProto request)
+ throws ServiceException {
+ try {
+ return ReadyToBecomeActiveResponseProto.newBuilder()
+ .setReadyToBecomeActive(server.readyToBecomeActive()).build();
+ } catch (IOException e) {
+ throw new ServiceException(e);
+ }
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/DefaultFailoverProxyProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/DefaultFailoverProxyProvider.java
index 812a46e..ae37d0b 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/DefaultFailoverProxyProvider.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/DefaultFailoverProxyProvider.java
@@ -27,28 +27,28 @@
* event of failover, and always returns the same proxy object.
*/
@InterfaceStability.Evolving
-public class DefaultFailoverProxyProvider implements FailoverProxyProvider {
+public class DefaultFailoverProxyProvider<T> implements FailoverProxyProvider<T> {
- private Object proxy;
- private Class<?> iface;
+ private T proxy;
+ private Class<T> iface;
- public DefaultFailoverProxyProvider(Class<?> iface, Object proxy) {
+ public DefaultFailoverProxyProvider(Class<T> iface, T proxy) {
this.proxy = proxy;
this.iface = iface;
}
@Override
- public Class<?> getInterface() {
+ public Class<T> getInterface() {
return iface;
}
@Override
- public Object getProxy() {
+ public T getProxy() {
return proxy;
}
@Override
- public void performFailover(Object currentProxy) {
+ public void performFailover(T currentProxy) {
// Nothing to do.
}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/FailoverProxyProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/FailoverProxyProvider.java
index 707a40d..ba7d29f 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/FailoverProxyProvider.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/FailoverProxyProvider.java
@@ -29,7 +29,7 @@
* {@link RetryPolicy}.
*/
@InterfaceStability.Evolving
-public interface FailoverProxyProvider extends Closeable {
+public interface FailoverProxyProvider<T> extends Closeable {
/**
* Get the proxy object which should be used until the next failover event
@@ -37,7 +37,7 @@
*
* @return the proxy object to invoke methods upon
*/
- public Object getProxy();
+ public T getProxy();
/**
* Called whenever the associated {@link RetryPolicy} determines that an error
@@ -46,7 +46,7 @@
* @param currentProxy the proxy object which was being used before this
* failover event
*/
- public void performFailover(Object currentProxy);
+ public void performFailover(T currentProxy);
/**
* Return a reference to the interface this provider's proxy objects actually
@@ -58,5 +58,5 @@
* @return the interface implemented by the proxy objects returned by
* {@link FailoverProxyProvider#getProxy()}
*/
- public Class<?> getInterface();
+ public Class<T> getInterface();
}
\ No newline at end of file
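
With the interface now generic, getProxy() returns a typed proxy rather than
Object. A small sketch, where the HAServiceProtocol proxy passed in is assumed
to have been obtained elsewhere:

    import org.apache.hadoop.ha.HAServiceProtocol;
    import org.apache.hadoop.io.retry.DefaultFailoverProxyProvider;
    import org.apache.hadoop.io.retry.FailoverProxyProvider;

    public class TypedProviderSketch {
      static FailoverProxyProvider<HAServiceProtocol> wrap(HAServiceProtocol svc) {
        // getProxy() on the returned provider yields HAServiceProtocol directly,
        // so callers no longer need to cast from Object.
        return new DefaultFailoverProxyProvider<HAServiceProtocol>(
            HAServiceProtocol.class, svc);
      }
    }
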
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java
index 0dad53b..323542c 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryInvocationHandler.java
@@ -20,14 +20,15 @@
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
-import java.lang.reflect.Proxy;
import java.util.Collections;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.retry.RetryPolicy.RetryAction;
+import org.apache.hadoop.util.ThreadUtil;
import org.apache.hadoop.ipc.Client.ConnectionId;
+import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RpcInvocationHandler;
class RetryInvocationHandler implements RpcInvocationHandler {
@@ -38,6 +39,7 @@
* The number of times the associated proxyProvider has ever been failed over.
*/
private long proxyProviderFailoverCount = 0;
+ private volatile boolean hasMadeASuccessfulCall = false;
private RetryPolicy defaultPolicy;
private Map<String,RetryPolicy> methodNameToPolicyMap;
@@ -78,47 +80,82 @@
invocationAttemptFailoverCount = proxyProviderFailoverCount;
}
try {
- return invokeMethod(method, args);
+ Object ret = invokeMethod(method, args);
+ hasMadeASuccessfulCall = true;
+ return ret;
} catch (Exception e) {
boolean isMethodIdempotent = proxyProvider.getInterface()
.getMethod(method.getName(), method.getParameterTypes())
.isAnnotationPresent(Idempotent.class);
RetryAction action = policy.shouldRetry(e, retries++, invocationFailoverCount,
isMethodIdempotent);
- if (action == RetryAction.FAIL) {
- LOG.warn("Exception while invoking " + method.getName()
- + " of " + currentProxy.getClass() + ". Not retrying.", e);
- if (!method.getReturnType().equals(Void.TYPE)) {
- throw e; // non-void methods can't fail without an exception
+ if (action.action == RetryAction.RetryDecision.FAIL) {
+ if (action.reason != null) {
+ LOG.warn("Exception while invoking " +
+ currentProxy.getClass() + "." + method.getName() +
+ ". Not retrying because " + action.reason, e);
}
- return null;
- } else if (action == RetryAction.FAILOVER_AND_RETRY) {
- LOG.warn("Exception while invoking " + method.getName()
- + " of " + currentProxy.getClass()
- + " after " + invocationFailoverCount + " fail over attempts."
- + " Trying to fail over.", e);
- // Make sure that concurrent failed method invocations only cause a
- // single actual fail over.
- synchronized (proxyProvider) {
- if (invocationAttemptFailoverCount == proxyProviderFailoverCount) {
- proxyProvider.performFailover(currentProxy);
- proxyProviderFailoverCount++;
- currentProxy = proxyProvider.getProxy();
+ throw e;
+ } else { // retry or failover
+ // avoid logging the failover if this is the first call on this
+ // proxy object, and we successfully achieve the failover without
+ // any flip-flopping
+ boolean worthLogging =
+ !(invocationFailoverCount == 0 && !hasMadeASuccessfulCall);
+ worthLogging |= LOG.isDebugEnabled();
+ if (action.action == RetryAction.RetryDecision.FAILOVER_AND_RETRY &&
+ worthLogging) {
+ String msg = "Exception while invoking " + method.getName()
+ + " of class " + currentProxy.getClass().getSimpleName();
+ if (invocationFailoverCount > 0) {
+ msg += " after " + invocationFailoverCount + " fail over attempts";
+ }
+ msg += ". Trying to fail over " + formatSleepMessage(action.delayMillis);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(msg, e);
} else {
- LOG.warn("A failover has occurred since the start of this method"
- + " invocation attempt.");
+ LOG.warn(msg);
+ }
+ } else {
+ if(LOG.isDebugEnabled()) {
+ LOG.debug("Exception while invoking " + method.getName()
+ + " of class " + currentProxy.getClass().getSimpleName() +
+ ". Retrying " + formatSleepMessage(action.delayMillis), e);
}
}
- invocationFailoverCount++;
- }
- if(LOG.isDebugEnabled()) {
- LOG.debug("Exception while invoking " + method.getName()
- + " of " + currentProxy.getClass() + ". Retrying.", e);
+
+ if (action.delayMillis > 0) {
+ ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis);
+ }
+
+ if (action.action == RetryAction.RetryDecision.FAILOVER_AND_RETRY) {
+ // Make sure that concurrent failed method invocations only cause a
+ // single actual fail over.
+ synchronized (proxyProvider) {
+ if (invocationAttemptFailoverCount == proxyProviderFailoverCount) {
+ proxyProvider.performFailover(currentProxy);
+ proxyProviderFailoverCount++;
+ currentProxy = proxyProvider.getProxy();
+ } else {
+ LOG.warn("A failover has occurred since the start of this method"
+ + " invocation attempt.");
+ }
+ }
+ invocationFailoverCount++;
+ }
}
}
}
}
-
+
+ private static String formatSleepMessage(long millis) {
+ if (millis > 0) {
+ return "after sleeping for " + millis + "ms.";
+ } else {
+ return "immediately.";
+ }
+ }
+
private Object invokeMethod(Method method, Object[] args) throws Throwable {
try {
if (!method.isAccessible()) {
@@ -137,9 +174,7 @@
@Override //RpcInvocationHandler
public ConnectionId getConnectionId() {
- RpcInvocationHandler inv = (RpcInvocationHandler) Proxy
- .getInvocationHandler(currentProxy);
- return inv.getConnectionId();
+ return RPC.getConnectionIdForProxy(currentProxy);
}
}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java
index 3634e18..2be8b75 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicies.java
@@ -33,6 +33,8 @@
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.StandbyException;
+import com.google.common.annotations.VisibleForTesting;
+
/**
* <p>
* A collection of useful implementations of {@link RetryPolicy}.
@@ -42,6 +44,8 @@
public static final Log LOG = LogFactory.getLog(RetryPolicies.class);
+ private static final Random RAND = new Random();
+
/**
* <p>
* Try once, and fail by re-throwing the exception.
@@ -52,14 +56,6 @@
/**
* <p>
- * Try once, and fail silently for <code>void</code> methods, or by
- * re-throwing the exception for non-<code>void</code> methods.
- * </p>
- */
- public static final RetryPolicy TRY_ONCE_DONT_FAIL = new TryOnceDontFail();
-
- /**
- * <p>
* Keep trying forever.
* </p>
*/
@@ -137,18 +133,19 @@
public static final RetryPolicy failoverOnNetworkException(
RetryPolicy fallbackPolicy, int maxFailovers) {
- return new FailoverOnNetworkExceptionRetry(fallbackPolicy, maxFailovers);
+ return failoverOnNetworkException(fallbackPolicy, maxFailovers, 0, 0);
+ }
+
+ public static final RetryPolicy failoverOnNetworkException(
+ RetryPolicy fallbackPolicy, int maxFailovers, long delayMillis,
+ long maxDelayBase) {
+ return new FailoverOnNetworkExceptionRetry(fallbackPolicy, maxFailovers,
+ delayMillis, maxDelayBase);
}
static class TryOnceThenFail implements RetryPolicy {
public RetryAction shouldRetry(Exception e, int retries, int failovers,
boolean isMethodIdempotent) throws Exception {
- throw e;
- }
- }
- static class TryOnceDontFail implements RetryPolicy {
- public RetryAction shouldRetry(Exception e, int retries, int failovers,
- boolean isMethodIdempotent) throws Exception {
return RetryAction.FAIL;
}
}
@@ -174,14 +171,10 @@
public RetryAction shouldRetry(Exception e, int retries, int failovers,
boolean isMethodIdempotent) throws Exception {
if (retries >= maxRetries) {
- throw e;
+ return RetryAction.FAIL;
}
- try {
- timeUnit.sleep(calculateSleepTime(retries));
- } catch (InterruptedException ie) {
- // retry
- }
- return RetryAction.RETRY;
+ return new RetryAction(RetryAction.RetryDecision.RETRY,
+ timeUnit.toMillis(calculateSleepTime(retries)));
}
protected abstract long calculateSleepTime(int retries);
@@ -268,7 +261,7 @@
}
static class ExponentialBackoffRetry extends RetryLimited {
- private Random r = new Random();
+
public ExponentialBackoffRetry(
int maxRetries, long sleepTime, TimeUnit timeUnit) {
super(maxRetries, sleepTime, timeUnit);
@@ -276,16 +269,19 @@
@Override
protected long calculateSleepTime(int retries) {
- return sleepTime*r.nextInt(1<<(retries+1));
+ return calculateExponentialTime(sleepTime, retries + 1);
}
}
- /*
+ /**
* Fail over and retry in the case of:
* Remote StandbyException (server is up, but is not the active server)
* Immediate socket exceptions (e.g. no route to host, econnrefused)
* Socket exceptions after initial connection when operation is idempotent
*
+ * The first failover is immediate, while all subsequent failovers wait an
+ * exponentially-increasing random amount of time.
+ *
* Fail immediately in the case of:
* Socket exceptions after initial connection when operation is not idempotent
*
@@ -295,33 +291,49 @@
private RetryPolicy fallbackPolicy;
private int maxFailovers;
+ private long delayMillis;
+ private long maxDelayBase;
public FailoverOnNetworkExceptionRetry(RetryPolicy fallbackPolicy,
int maxFailovers) {
+ this(fallbackPolicy, maxFailovers, 0, 0);
+ }
+
+ public FailoverOnNetworkExceptionRetry(RetryPolicy fallbackPolicy,
+ int maxFailovers, long delayMillis, long maxDelayBase) {
this.fallbackPolicy = fallbackPolicy;
this.maxFailovers = maxFailovers;
+ this.delayMillis = delayMillis;
+ this.maxDelayBase = maxDelayBase;
}
@Override
public RetryAction shouldRetry(Exception e, int retries,
int failovers, boolean isMethodIdempotent) throws Exception {
if (failovers >= maxFailovers) {
- LOG.info("Failovers (" + failovers + ") exceeded maximum allowed ("
+ return new RetryAction(RetryAction.RetryDecision.FAIL, 0,
+ "failovers (" + failovers + ") exceeded maximum allowed ("
+ maxFailovers + ")");
- return RetryAction.FAIL;
}
if (e instanceof ConnectException ||
e instanceof NoRouteToHostException ||
e instanceof UnknownHostException ||
- e instanceof StandbyException) {
- return RetryAction.FAILOVER_AND_RETRY;
+ e instanceof StandbyException ||
+ isWrappedStandbyException(e)) {
+ return new RetryAction(
+ RetryAction.RetryDecision.FAILOVER_AND_RETRY,
+ // retry immediately if this is our first failover, sleep otherwise
+ failovers == 0 ? 0 :
+ calculateExponentialTime(delayMillis, failovers, maxDelayBase));
} else if (e instanceof SocketException ||
- e instanceof IOException) {
+ (e instanceof IOException && !(e instanceof RemoteException))) {
if (isMethodIdempotent) {
return RetryAction.FAILOVER_AND_RETRY;
} else {
- return RetryAction.FAIL;
+ return new RetryAction(RetryAction.RetryDecision.FAIL, 0,
+ "the invoked method is not idempotent, and unable to determine " +
+ "whether it was invoked");
}
} else {
return fallbackPolicy.shouldRetry(e, retries, failovers,
@@ -330,4 +342,34 @@
}
}
+
+ /**
+ * Return a value equal to <code>time</code> scaled up exponentially as a
+ * function of <code>retries</code>, then adjusted by a randomly chosen
+ * +/- 0%-50% of that value.
+ *
+ * @param time the base amount of time to work with
+ * @param retries the number of retries that have occurred so far
+ * @param cap value at which to cap the base sleep time
+ * @return an amount of time to sleep
+ */
+ @VisibleForTesting
+ public static long calculateExponentialTime(long time, int retries,
+ long cap) {
+ long baseTime = Math.min(time * ((long)1 << retries), cap);
+ return (long) (baseTime * (RAND.nextFloat() + 0.5));
+ }
+
+ private static long calculateExponentialTime(long time, int retries) {
+ return calculateExponentialTime(time, retries, Long.MAX_VALUE);
+ }
+
+ private static boolean isWrappedStandbyException(Exception e) {
+ if (!(e instanceof RemoteException)) {
+ return false;
+ }
+ Exception unwrapped = ((RemoteException)e).unwrapRemoteException(
+ StandbyException.class);
+ return unwrapped instanceof StandbyException;
+ }
}
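
For reference, a standalone sketch of the randomized exponential backoff implemented by the calculateExponentialTime helper above (hypothetical class name, not part of the patch), showing how the sleep between failovers grows:

    import java.util.Random;

    public class BackoffSketch {
      private static final Random RAND = new Random();

      // time * 2^retries, capped at 'cap', then jittered to 50%-150% of that value
      static long calculateExponentialTime(long time, int retries, long cap) {
        long baseTime = Math.min(time * (1L << retries), cap);
        return (long) (baseTime * (RAND.nextFloat() + 0.5));
      }

      public static void main(String[] args) {
        // with a 500 ms base delay, failovers 1..5 sleep roughly
        // 0.5-1.5 s, 1-3 s, 2-6 s, 4-12 s, and 7.5-22.5 s (base capped at 15 s)
        for (int failovers = 1; failovers <= 5; failovers++) {
          System.out.println(calculateExponentialTime(500, failovers, 15000));
        }
      }
    }
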
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java
index 4c4534f..ed673e9 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryPolicy.java
@@ -19,7 +19,6 @@
import org.apache.hadoop.classification.InterfaceStability;
-
/**
* <p>
* Specifies a policy for retrying method failures.
@@ -33,10 +32,39 @@
* Returned by {@link RetryPolicy#shouldRetry(Exception, int, int, boolean)}.
*/
@InterfaceStability.Evolving
- public enum RetryAction {
- FAIL,
- RETRY,
- FAILOVER_AND_RETRY
+ public static class RetryAction {
+
+ // A few common retry policies, with no delays.
+ public static final RetryAction FAIL =
+ new RetryAction(RetryDecision.FAIL);
+ public static final RetryAction RETRY =
+ new RetryAction(RetryDecision.RETRY);
+ public static final RetryAction FAILOVER_AND_RETRY =
+ new RetryAction(RetryDecision.FAILOVER_AND_RETRY);
+
+ public final RetryDecision action;
+ public final long delayMillis;
+ public final String reason;
+
+ public RetryAction(RetryDecision action) {
+ this(action, 0, null);
+ }
+
+ public RetryAction(RetryDecision action, long delayTime) {
+ this(action, delayTime, null);
+ }
+
+ public RetryAction(RetryDecision action, long delayTime, String reason) {
+ this.action = action;
+ this.delayMillis = delayTime;
+ this.reason = reason;
+ }
+
+ public enum RetryDecision {
+ FAIL,
+ RETRY,
+ FAILOVER_AND_RETRY
+ }
}
/**
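
For illustration, a minimal sketch of a custom policy built on the RetryAction class introduced above (hypothetical class name, not part of the patch): retry up to three times with a 200 ms delay, then fail with a reason string.

    import org.apache.hadoop.io.retry.RetryPolicy;

    public class SleepThreeTimesPolicy implements RetryPolicy {
      @Override
      public RetryAction shouldRetry(Exception e, int retries, int failovers,
          boolean isMethodIdempotent) throws Exception {
        if (retries < 3) {
          // retry, asking the caller to sleep 200 ms first
          return new RetryAction(RetryAction.RetryDecision.RETRY, 200);
        }
        // give up, carrying an explanatory reason for the log message
        return new RetryAction(RetryAction.RetryDecision.FAIL, 0,
            "retried " + retries + " times without success");
      }
    }
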
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java
index f09600d..e5a2d7f 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java
@@ -227,6 +227,8 @@
private int maxIdleTime; //connections will be culled if it was idle for
//maxIdleTime msecs
private int maxRetries; //the max. no. of retries for socket connections
+ // the max. no. of retries for socket connections on timeout exceptions
+ private int maxRetriesOnSocketTimeouts;
private boolean tcpNoDelay; // if T then disable Nagle's Algorithm
private boolean doPing; //do we need to send ping message
private int pingInterval; // how often sends ping to the server in msecs
@@ -250,6 +252,7 @@
this.rpcTimeout = remoteId.getRpcTimeout();
this.maxIdleTime = remoteId.getMaxIdleTime();
this.maxRetries = remoteId.getMaxRetries();
+ this.maxRetriesOnSocketTimeouts = remoteId.getMaxRetriesOnSocketTimeouts();
this.tcpNoDelay = remoteId.getTcpNoDelay();
this.doPing = remoteId.getDoPing();
this.pingInterval = remoteId.getPingInterval();
@@ -478,11 +481,8 @@
if (updateAddress()) {
timeoutFailures = ioFailures = 0;
}
- /*
- * The max number of retries is 45, which amounts to 20s*45 = 15
- * minutes retries.
- */
- handleConnectionFailure(timeoutFailures++, 45, toe);
+ handleConnectionFailure(timeoutFailures++,
+ maxRetriesOnSocketTimeouts, toe);
} catch (IOException ie) {
if (updateAddress()) {
timeoutFailures = ioFailures = 0;
@@ -1286,6 +1286,8 @@
private final int maxIdleTime; //connections will be culled if it was idle for
//maxIdleTime msecs
private final int maxRetries; //the max. no. of retries for socket connections
+ // the max. no. of retries for socket connections on timeout exceptions
+ private final int maxRetriesOnSocketTimeouts;
private final boolean tcpNoDelay; // if T then disable Nagle's Algorithm
private final boolean doPing; //do we need to send ping message
private final int pingInterval; // how often sends ping to the server in msecs
@@ -1293,8 +1295,8 @@
ConnectionId(InetSocketAddress address, Class<?> protocol,
UserGroupInformation ticket, int rpcTimeout,
String serverPrincipal, int maxIdleTime,
- int maxRetries, boolean tcpNoDelay,
- boolean doPing, int pingInterval) {
+ int maxRetries, int maxRetriesOnSocketTimeouts,
+ boolean tcpNoDelay, boolean doPing, int pingInterval) {
this.protocol = protocol;
this.address = address;
this.ticket = ticket;
@@ -1302,6 +1304,7 @@
this.serverPrincipal = serverPrincipal;
this.maxIdleTime = maxIdleTime;
this.maxRetries = maxRetries;
+ this.maxRetriesOnSocketTimeouts = maxRetriesOnSocketTimeouts;
this.tcpNoDelay = tcpNoDelay;
this.doPing = doPing;
this.pingInterval = pingInterval;
@@ -1335,6 +1338,11 @@
return maxRetries;
}
+ /** max connection retries on socket timeouts */
+ public int getMaxRetriesOnSocketTimeouts() {
+ return maxRetriesOnSocketTimeouts;
+ }
+
boolean getTcpNoDelay() {
return tcpNoDelay;
}
@@ -1369,6 +1377,9 @@
CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_DEFAULT),
conf.getInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY,
CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT),
+ conf.getInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_DEFAULT),
conf.getBoolean(CommonConfigurationKeysPublic.IPC_CLIENT_TCPNODELAY_KEY,
CommonConfigurationKeysPublic.IPC_CLIENT_TCPNODELAY_DEFAULT),
doPing,
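
The hard-coded limit of 45 timeout retries above is now read from configuration. A minimal sketch of overriding it on the client side (hypothetical class name; the key matches the new core-default.xml entry):

    import org.apache.hadoop.conf.Configuration;

    public class TimeoutRetryConfigSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // retry at most 10 times on connect timeouts instead of the default 45
        conf.setInt("ipc.client.connect.max.retries.on.timeouts", 10);
      }
    }
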
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ProtocolTranslator.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ProtocolTranslator.java
new file mode 100644
index 0000000..5bf9dba
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ProtocolTranslator.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ipc;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+/**
+ * An interface implemented by client-side protocol translators to get the
+ * underlying proxy object the translator is operating on.
+ */
+@InterfaceAudience.Private
+public interface ProtocolTranslator {
+
+ /**
+ * Return the proxy object underlying this protocol translator.
+ * @return the proxy object underlying this protocol translator.
+ */
+ public Object getUnderlyingProxyObject();
+
+}
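
A minimal sketch of a translator implementing the new interface (hypothetical wrapper; real translators also adapt the protocol methods themselves):

    import org.apache.hadoop.ipc.ProtocolTranslator;

    public class ExampleTranslator implements ProtocolTranslator {
      private final Object rpcProxy;

      public ExampleTranslator(Object rpcProxy) {
        this.rpcProxy = rpcProxy;
      }

      @Override
      public Object getUnderlyingProxyObject() {
        // RPC.getConnectionIdForProxy() and RPC.stopProxy() unwrap through this
        return rpcProxy;
      }
    }
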
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java
index 4f85e90..eee364c 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java
@@ -40,6 +40,7 @@
import org.apache.commons.logging.*;
import org.apache.hadoop.io.*;
+import org.apache.hadoop.ipc.Client.ConnectionId;
import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind;
import org.apache.hadoop.ipc.protobuf.ProtocolInfoProtos.ProtocolInfoService;
import org.apache.hadoop.net.NetUtils;
@@ -530,9 +531,24 @@
* Returns the server address for a given proxy.
*/
public static InetSocketAddress getServerAddress(Object proxy) {
+ return getConnectionIdForProxy(proxy).getAddress();
+ }
+
+ /**
+ * Return the connection ID of the given object. If the provided object is in
+ * fact a protocol translator, we'll get the connection ID of the underlying
+ * proxy object.
+ *
+ * @param proxy the proxy object to get the connection ID of.
+ * @return the connection ID for the provided proxy object.
+ */
+ public static ConnectionId getConnectionIdForProxy(Object proxy) {
+ if (proxy instanceof ProtocolTranslator) {
+ proxy = ((ProtocolTranslator)proxy).getUnderlyingProxyObject();
+ }
RpcInvocationHandler inv = (RpcInvocationHandler) Proxy
.getInvocationHandler(proxy);
- return inv.getConnectionId().getAddress();
+ return inv.getConnectionId();
}
/**
@@ -564,6 +580,12 @@
* @param proxy the RPC proxy object to be stopped
*/
public static void stopProxy(Object proxy) {
+ if (proxy instanceof ProtocolTranslator) {
+ RPC.stopProxy(((ProtocolTranslator)proxy)
+ .getUnderlyingProxyObject());
+ return;
+ }
+
InvocationHandler invocationHandler = null;
try {
invocationHandler = Proxy.getInvocationHandler(proxy);
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java
index 2b35598..5f642c4 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java
@@ -1671,6 +1671,10 @@
// on the server side, as opposed to just a normal exceptional
// result.
LOG.warn(logMsg, e);
+ } else if (e instanceof StandbyException) {
+ // Don't log the whole stack trace of these exceptions.
+ // Way too noisy!
+ LOG.info(logMsg);
} else {
LOG.info(logMsg, e);
}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/StandbyException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/StandbyException.java
index 49f4fad..7a16861 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/StandbyException.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/StandbyException.java
@@ -17,6 +17,8 @@
*/
package org.apache.hadoop.ipc;
+import java.io.IOException;
+
import org.apache.hadoop.classification.InterfaceStability;
/**
@@ -24,7 +26,7 @@
* set of servers in which only a subset may be active.
*/
@InterfaceStability.Evolving
-public class StandbyException extends Exception {
+public class StandbyException extends IOException {
static final long serialVersionUID = 0x12308AD010L;
public StandbyException(String msg) {
super(msg);
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java
index 2685887..43132d2 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java
@@ -23,6 +23,7 @@
import java.net.URL;
import java.net.UnknownHostException;
import java.security.AccessController;
+import java.security.PrivilegedAction;
import java.util.Arrays;
import java.util.List;
import java.util.ServiceLoader;
@@ -449,6 +450,27 @@
}
/**
+ * Perform the given action as the daemon's login user. If the login
+ * user cannot be determined, this will log a FATAL error and exit
+ * the whole JVM.
+ */
+ public static <T> T doAsLoginUserOrFatal(PrivilegedAction<T> action) {
+ if (UserGroupInformation.isSecurityEnabled()) {
+ UserGroupInformation ugi = null;
+ try {
+ ugi = UserGroupInformation.getLoginUser();
+ } catch (IOException e) {
+ LOG.fatal("Exception while getting login user", e);
+ e.printStackTrace();
+ Runtime.getRuntime().exit(-1);
+ }
+ return ugi.doAs(action);
+ } else {
+ return action.run();
+ }
+ }
+
+ /**
* Resolves a host subject to the security requirements determined by
* hadoop.security.token.service.use_ip.
*
@@ -597,5 +619,5 @@
void setSearchDomains(String ... domains) {
searchDomains = Arrays.asList(domains);
}
- }
+ }
}
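
A minimal usage sketch of the new doAsLoginUserOrFatal helper (hypothetical class name; the action body is a placeholder):

    import java.security.PrivilegedAction;

    import org.apache.hadoop.security.SecurityUtil;

    public class LoginUserActionSketch {
      public static void main(String[] args) {
        SecurityUtil.doAsLoginUserOrFatal(new PrivilegedAction<Void>() {
          @Override
          public Void run() {
            // privileged work, e.g. renewing delegation tokens, goes here
            return null;
          }
        });
      }
    }
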
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java
index 3c2e666..11df981 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/AbstractDelegationTokenSecretManager.java
@@ -40,6 +40,8 @@
import org.apache.hadoop.security.token.SecretManager;
import org.apache.hadoop.util.Daemon;
+import com.google.common.base.Preconditions;
+
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
@InterfaceStability.Evolving
public abstract
@@ -84,6 +86,12 @@
private Thread tokenRemoverThread;
protected volatile boolean running;
+ /**
+ * If the delegation token update thread holds this lock, it will
+ * not get interrupted.
+ */
+ protected Object noInterruptsLock = new Object();
+
public AbstractDelegationTokenSecretManager(long delegationKeyUpdateInterval,
long delegationTokenMaxLifetime, long delegationTokenRenewInterval,
long delegationTokenRemoverScanInterval) {
@@ -95,6 +103,7 @@
/** should be called before this object is used */
public void startThreads() throws IOException {
+ Preconditions.checkState(!running);
updateCurrentKey();
synchronized (this) {
running = true;
@@ -354,12 +363,21 @@
}
}
- public synchronized void stopThreads() {
+ public void stopThreads() {
if (LOG.isDebugEnabled())
LOG.debug("Stopping expired delegation token remover thread");
running = false;
+
if (tokenRemoverThread != null) {
- tokenRemoverThread.interrupt();
+ synchronized (noInterruptsLock) {
+ tokenRemoverThread.interrupt();
+ }
+ try {
+ tokenRemoverThread.join();
+ } catch (InterruptedException e) {
+ throw new RuntimeException(
+ "Unable to join on token removal thread", e);
+ }
}
}
@@ -395,7 +413,7 @@
lastTokenCacheCleanup = now;
}
try {
- Thread.sleep(5000); // 5 seconds
+ Thread.sleep(Math.min(5000, keyUpdateInterval)); // at most 5 seconds
} catch (InterruptedException ie) {
LOG
.error("InterruptedExcpetion recieved for ExpiredTokenRemover thread "
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java
new file mode 100644
index 0000000..6e4dfaf
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ThreadUtil.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.util;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.classification.InterfaceStability;
+
+@InterfaceStability.Evolving
+public class ThreadUtil {
+
+ private static final Log LOG = LogFactory.getLog(ThreadUtil.class);
+
+ /**
+ * Cause the current thread to sleep as close as possible to the provided
+ * number of milliseconds. This method will log and ignore any
+ * {@link InterruptedException} encountered.
+ *
+ * @param millis the number of milliseconds for the current thread to sleep
+ */
+ public static void sleepAtLeastIgnoreInterrupts(long millis) {
+ long start = System.currentTimeMillis();
+ while (System.currentTimeMillis() - start < millis) {
+ long timeToSleep = millis -
+ (System.currentTimeMillis() - start);
+ try {
+ Thread.sleep(timeToSleep);
+ } catch (InterruptedException ie) {
+ LOG.warn("interrupted while sleeping", ie);
+ }
+ }
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/packages/templates/conf/hadoop-policy.xml b/hadoop-common-project/hadoop-common/src/main/packages/templates/conf/hadoop-policy.xml
index b3e12d1..2fd9f8d 100644
--- a/hadoop-common-project/hadoop-common/src/main/packages/templates/conf/hadoop-policy.xml
+++ b/hadoop-common-project/hadoop-common/src/main/packages/templates/conf/hadoop-policy.xml
@@ -216,6 +216,13 @@
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
+
+ <property>
+ <name>security.ha.service.protocol.acl</name>
+ <value>*</value>
+ <description>ACL for HAService protocol used by HAAdmin to manage the
+ active and standby states of the namenode.</description>
+ </property>
<property>
<name>security.mrhs.client.protocol.acl</name>
diff --git a/hadoop-common-project/hadoop-common/src/main/proto/HAServiceProtocol.proto b/hadoop-common-project/hadoop-common/src/main/proto/HAServiceProtocol.proto
new file mode 100644
index 0000000..a3fd86c
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/proto/HAServiceProtocol.proto
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+option java_package = "org.apache.hadoop.ha.proto";
+option java_outer_classname = "HAServiceProtocolProtos";
+option java_generic_services = true;
+option java_generate_equals_and_hash = true;
+
+enum HAServiceStateProto {
+ INITIALIZING = 0;
+ ACTIVE = 1;
+ STANDBY = 2;
+}
+
+/**
+ * void request
+ */
+message MonitorHealthRequestProto {
+}
+
+/**
+ * void response
+ */
+message MonitorHealthResponseProto {
+}
+
+/**
+ * void request
+ */
+message TransitionToActiveRequestProto {
+}
+
+/**
+ * void response
+ */
+message TransitionToActiveResponseProto {
+}
+
+/**
+ * void request
+ */
+message TransitionToStandbyRequestProto {
+}
+
+/**
+ * void response
+ */
+message TransitionToStandbyResponseProto {
+}
+
+/**
+ * void request
+ */
+message GetServiceStateRequestProto {
+}
+
+/**
+ * Returns the state of the service
+ */
+message GetServiceStateResponseProto {
+ required HAServiceStateProto state = 1;
+}
+
+/**
+ * void request
+ */
+message ReadyToBecomeActiveRequestProto {
+}
+
+/**
+ * Returns true if service is ready to become active
+ */
+message ReadyToBecomeActiveResponseProto {
+ required bool readyToBecomeActive = 1;
+}
+
+/**
+ * Protocol interface that provides high-availability-related
+ * primitives to monitor and fail over a service.
+ *
+ * For details see o.a.h.ha.HAServiceProtocol.
+ */
+service HAServiceProtocolService {
+ /**
+ * Monitor the health of a service.
+ */
+ rpc monitorHealth(MonitorHealthRequestProto)
+ returns(MonitorHealthResponseProto);
+
+ /**
+ * Request service to transition to active state.
+ */
+ rpc transitionToActive(TransitionToActiveRequestProto)
+ returns(TransitionToActiveResponseProto);
+
+ /**
+ * Request service to transition to standby state.
+ */
+ rpc transitionToStandby(TransitionToStandbyRequestProto)
+ returns(TransitionToStandbyResponseProto);
+
+ /**
+ * Get the current state of the service.
+ */
+ rpc getServiceState(GetServiceStateRequestProto)
+ returns(GetServiceStateResponseProto);
+
+ /**
+ * Check if the service is ready to become active
+ */
+ rpc readyToBecomeActive(ReadyToBecomeActiveRequestProto)
+ returns(ReadyToBecomeActiveResponseProto);
+}
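
A minimal client-side sketch of the Java HAServiceProtocol interface behind this service definition (hypothetical class name; assumes an already-obtained client proxy):

    import org.apache.hadoop.ha.HAServiceProtocol;
    import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;

    public class HATransitionSketch {
      // ask the service to become active if it is standby and reports ready
      static void activateIfReady(HAServiceProtocol svc) throws Exception {
        if (svc.getServiceState() == HAServiceState.STANDBY
            && svc.readyToBecomeActive()) {
          svc.transitionToActive();
        }
      }
    }
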
diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
index 8fc45c5..a968400 100644
--- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
+++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
@@ -488,6 +488,14 @@
</property>
<property>
+ <name>ipc.client.connect.max.retries.on.timeouts</name>
+ <value>45</value>
+ <description>Indicates the number of retries a client will make on socket timeout
+ to establish a server connection.
+ </description>
+</property>
+
+<property>
<name>ipc.server.listen.queue.size</name>
<value>128</value>
<description>Indicates the length of the listen queue for servers accepting
@@ -849,4 +857,30 @@
</description>
</property>
+<property>
+ <name>dfs.ha.fencing.methods</name>
+ <value></value>
+ <description>
+ List of fencing methods to use for service fencing. May contain
+ built-in methods (e.g. shell and sshfence) or user-defined methods.
+ </description>
+</property>
+
+<property>
+ <name>dfs.ha.fencing.ssh.connect-timeout</name>
+ <value>30000</value>
+ <description>
+ SSH connection timeout, in milliseconds, to use with the builtin
+ sshfence fencer.
+ </description>
+</property>
+
+<property>
+ <name>dfs.ha.fencing.ssh.private-key-files</name>
+ <value></value>
+ <description>
+ The SSH private key files to use with the builtin sshfence fencer.
+ </description>
+</property>
+
</configuration>
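
A minimal sketch of setting the new fencing keys programmatically (hypothetical class name and key-file path, for illustration only):

    import org.apache.hadoop.conf.Configuration;

    public class FencingConfigSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("dfs.ha.fencing.methods", "sshfence");           // use the builtin SSH fencer
        conf.setInt("dfs.ha.fencing.ssh.connect-timeout", 30000); // 30 s SSH connect timeout
        // hypothetical private key location
        conf.set("dfs.ha.fencing.ssh.private-key-files", "/home/hadoop/.ssh/id_rsa");
      }
    }
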
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
new file mode 100644
index 0000000..fec350d
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElector.java
@@ -0,0 +1,527 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.zookeeper.AsyncCallback;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.KeeperException.Code;
+import org.apache.zookeeper.WatchedEvent;
+import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.Watcher.Event;
+import org.apache.zookeeper.data.ACL;
+import org.apache.zookeeper.data.Stat;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.Assert;
+import org.mockito.Mockito;
+
+import org.apache.hadoop.HadoopIllegalArgumentException;
+import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
+import org.apache.hadoop.ha.ActiveStandbyElector.ActiveNotFoundException;
+
+public class TestActiveStandbyElector {
+
+ static ZooKeeper mockZK;
+ static int count;
+ static ActiveStandbyElectorCallback mockApp;
+ static final byte[] data = new byte[8];
+
+ ActiveStandbyElectorTester elector;
+
+ class ActiveStandbyElectorTester extends ActiveStandbyElector {
+ ActiveStandbyElectorTester(String hostPort, int timeout, String parent,
+ List<ACL> acl, ActiveStandbyElectorCallback app) throws IOException {
+ super(hostPort, timeout, parent, acl, app);
+ }
+
+ @Override
+ public ZooKeeper getNewZooKeeper() {
+ ++TestActiveStandbyElector.count;
+ return TestActiveStandbyElector.mockZK;
+ }
+
+ }
+
+ private static final String zkParentName = "/zookeeper";
+ private static final String zkLockPathName = "/zookeeper/"
+ + ActiveStandbyElector.LOCKFILENAME;
+
+ @Before
+ public void init() throws IOException {
+ count = 0;
+ mockZK = Mockito.mock(ZooKeeper.class);
+ mockApp = Mockito.mock(ActiveStandbyElectorCallback.class);
+ elector = new ActiveStandbyElectorTester("hostPort", 1000, zkParentName,
+ Ids.OPEN_ACL_UNSAFE, mockApp);
+ }
+
+ /**
+ * verify that joinElection checks for null data
+ */
+ @Test(expected = HadoopIllegalArgumentException.class)
+ public void testJoinElectionException() {
+ elector.joinElection(null);
+ }
+
+ /**
+ * verify that joinElection tries to create ephemeral lock znode
+ */
+ @Test
+ public void testJoinElection() {
+ elector.joinElection(data);
+ Mockito.verify(mockZK, Mockito.times(1)).create(zkLockPathName, data,
+ Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null);
+ }
+
+ /**
+ * verify that a successful znode create result makes the elector active and
+ * starts monitoring
+ */
+ @Test
+ public void testCreateNodeResultBecomeActive() {
+ elector.joinElection(data);
+ elector.processResult(Code.OK.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeActive();
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+
+ // monitor callback verifies the leader is the ephemeral owner of the lock
+ // but does not call becomeActive since it's already active
+ Stat stat = new Stat();
+ stat.setEphemeralOwner(1L);
+ Mockito.when(mockZK.getSessionId()).thenReturn(1L);
+ elector.processResult(Code.OK.intValue(), zkLockPathName, null, stat);
+ // should not call neutral mode/standby/active
+ Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
+ Mockito.verify(mockApp, Mockito.times(0)).becomeStandby();
+ Mockito.verify(mockApp, Mockito.times(1)).becomeActive();
+ // another joinElection not called.
+ Mockito.verify(mockZK, Mockito.times(1)).create(zkLockPathName, data,
+ Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null);
+ // no new monitor called
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+ }
+
+ /**
+ * verify that a znode create on an existing node, with no retry, makes the
+ * elector standby and starts monitoring
+ */
+ @Test
+ public void testCreateNodeResultBecomeStandby() {
+ elector.joinElection(data);
+
+ elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeStandby();
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+ }
+
+ /**
+ * verify that a znode create error results in a fatal error
+ */
+ @Test
+ public void testCreateNodeResultError() {
+ elector.joinElection(data);
+
+ elector.processResult(Code.APIERROR.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(1)).notifyFatalError(
+ "Received create error from Zookeeper. code:APIERROR");
+ }
+
+ /**
+ * verify that retry of network errors verifies the active by session id and
+ * becomes active if they match. Monitoring is started.
+ */
+ @Test
+ public void testCreateNodeResultRetryBecomeActive() {
+ elector.joinElection(data);
+
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ // 4 errors results in fatalError
+ Mockito
+ .verify(mockApp, Mockito.times(1))
+ .notifyFatalError(
+ "Received create error from Zookeeper. code:CONNECTIONLOSS. "+
+ "Not retrying further znode create connection errors.");
+
+ elector.joinElection(data);
+ // recreate connection via getNewZooKeeper
+ Assert.assertEquals(2, TestActiveStandbyElector.count);
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+
+ Stat stat = new Stat();
+ stat.setEphemeralOwner(1L);
+ Mockito.when(mockZK.getSessionId()).thenReturn(1L);
+ elector.processResult(Code.OK.intValue(), zkLockPathName, null, stat);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeActive();
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+ Mockito.verify(mockZK, Mockito.times(6)).create(zkLockPathName, data,
+ Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null);
+ }
+
+ /**
+ * verify that retry of network errors verifies the active by session id and
+ * becomes standby if they don't match. Monitoring is started.
+ */
+ @Test
+ public void testCreateNodeResultRetryBecomeStandby() {
+ elector.joinElection(data);
+
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+
+ Stat stat = new Stat();
+ stat.setEphemeralOwner(0);
+ Mockito.when(mockZK.getSessionId()).thenReturn(1L);
+ elector.processResult(Code.OK.intValue(), zkLockPathName, null, stat);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeStandby();
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+ }
+
+ /**
+ * verify that if the znode create results in NODEEXISTS and that znode is
+ * deleted before the exists() watch is set, the exists() callback re-creates
+ * the znode and the elector becomes active
+ */
+ @Test
+ public void testCreateNodeResultRetryNoNode() {
+ elector.joinElection(data);
+
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+
+ elector.processResult(Code.NONODE.intValue(), zkLockPathName, null,
+ (Stat) null);
+ Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode();
+ Mockito.verify(mockZK, Mockito.times(4)).create(zkLockPathName, data,
+ Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null);
+ }
+
+ /**
+ * verify that more than 3 network error retries result in a fatal error
+ */
+ @Test
+ public void testStatNodeRetry() {
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ (Stat) null);
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ (Stat) null);
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ (Stat) null);
+ elector.processResult(Code.CONNECTIONLOSS.intValue(), zkLockPathName, null,
+ (Stat) null);
+ Mockito
+ .verify(mockApp, Mockito.times(1))
+ .notifyFatalError(
+ "Received stat error from Zookeeper. code:CONNECTIONLOSS. "+
+ "Not retrying further znode monitoring connection errors.");
+ }
+
+ /**
+ * verify error in exists() callback results in fatal error
+ */
+ @Test
+ public void testStatNodeError() {
+ elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), zkLockPathName,
+ null, (Stat) null);
+ Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
+ Mockito.verify(mockApp, Mockito.times(1)).notifyFatalError(
+ "Received stat error from Zookeeper. code:RUNTIMEINCONSISTENCY");
+ }
+
+ /**
+ * verify behavior of watcher.process callback with non-node event
+ */
+ @Test
+ public void testProcessCallbackEventNone() {
+ elector.joinElection(data);
+
+ WatchedEvent mockEvent = Mockito.mock(WatchedEvent.class);
+ Mockito.when(mockEvent.getType()).thenReturn(Event.EventType.None);
+
+ // first SyncConnected should not do anything
+ Mockito.when(mockEvent.getState()).thenReturn(
+ Event.KeeperState.SyncConnected);
+ elector.process(mockEvent);
+ Mockito.verify(mockZK, Mockito.times(0)).exists(Mockito.anyString(),
+ Mockito.anyBoolean(), Mockito.<AsyncCallback.StatCallback> anyObject(),
+ Mockito.<Object> anyObject());
+
+ // disconnection should enter safe mode
+ Mockito.when(mockEvent.getState()).thenReturn(
+ Event.KeeperState.Disconnected);
+ elector.process(mockEvent);
+ Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode();
+
+ // re-connection should monitor master status
+ Mockito.when(mockEvent.getState()).thenReturn(
+ Event.KeeperState.SyncConnected);
+ elector.process(mockEvent);
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+
+ // session expired should enter safe mode and initiate re-election
+ // re-election checked via checking re-creation of new zookeeper and
+ // call to create lock znode
+ Mockito.when(mockEvent.getState()).thenReturn(Event.KeeperState.Expired);
+ elector.process(mockEvent);
+ // already in safe mode above. should not enter safe mode again
+ Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode();
+ // called getNewZooKeeper to create new session. first call was in
+ // constructor
+ Assert.assertEquals(2, TestActiveStandbyElector.count);
+ // once in initial joinElection and one now
+ Mockito.verify(mockZK, Mockito.times(2)).create(zkLockPathName, data,
+ Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null);
+
+ // create znode success. become master and monitor
+ elector.processResult(Code.OK.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeActive();
+ Mockito.verify(mockZK, Mockito.times(2)).exists(zkLockPathName, true,
+ elector, null);
+
+ // error event results in fatal error
+ Mockito.when(mockEvent.getState()).thenReturn(Event.KeeperState.AuthFailed);
+ elector.process(mockEvent);
+ Mockito.verify(mockApp, Mockito.times(1)).notifyFatalError(
+ "Unexpected Zookeeper watch event state: AuthFailed");
+ // only 1 state change callback is called at a time
+ Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode();
+ }
+
+ /**
+ * verify behavior of watcher.process with node event
+ */
+ @Test
+ public void testProcessCallbackEventNode() {
+ elector.joinElection(data);
+
+ // make the object go into the monitoring state
+ elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeStandby();
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+
+ WatchedEvent mockEvent = Mockito.mock(WatchedEvent.class);
+ Mockito.when(mockEvent.getPath()).thenReturn(zkLockPathName);
+
+ // monitoring should be setup again after event is received
+ Mockito.when(mockEvent.getType()).thenReturn(
+ Event.EventType.NodeDataChanged);
+ elector.process(mockEvent);
+ Mockito.verify(mockZK, Mockito.times(2)).exists(zkLockPathName, true,
+ elector, null);
+
+ // monitoring should be setup again after event is received
+ Mockito.when(mockEvent.getType()).thenReturn(
+ Event.EventType.NodeChildrenChanged);
+ elector.process(mockEvent);
+ Mockito.verify(mockZK, Mockito.times(3)).exists(zkLockPathName, true,
+ elector, null);
+
+ // lock node deletion when in standby mode should create the znode again;
+ // successful znode creation enters active state and sets the monitor
+ Mockito.when(mockEvent.getType()).thenReturn(Event.EventType.NodeDeleted);
+ elector.process(mockEvent);
+ // enterNeutralMode not called when app is standby and leader is lost
+ Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
+ // once in initial joinElection() and one now
+ Mockito.verify(mockZK, Mockito.times(2)).create(zkLockPathName, data,
+ Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null);
+ elector.processResult(Code.OK.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeActive();
+ Mockito.verify(mockZK, Mockito.times(4)).exists(zkLockPathName, true,
+ elector, null);
+
+ // lock node deletion in active mode should enter neutral mode and create
+ // the znode again; successful znode creation enters active state and sets
+ // the monitor
+ Mockito.when(mockEvent.getType()).thenReturn(Event.EventType.NodeDeleted);
+ elector.process(mockEvent);
+ Mockito.verify(mockApp, Mockito.times(1)).enterNeutralMode();
+ // another joinElection called
+ Mockito.verify(mockZK, Mockito.times(3)).create(zkLockPathName, data,
+ Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null);
+ elector.processResult(Code.OK.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(2)).becomeActive();
+ Mockito.verify(mockZK, Mockito.times(5)).exists(zkLockPathName, true,
+ elector, null);
+
+ // bad path name results in fatal error
+ Mockito.when(mockEvent.getPath()).thenReturn(null);
+ elector.process(mockEvent);
+ Mockito.verify(mockApp, Mockito.times(1)).notifyFatalError(
+ "Unexpected watch error from Zookeeper");
+ // fatal error means no new connection other than one from constructor
+ Assert.assertEquals(1, TestActiveStandbyElector.count);
+ // no new watches after fatal error
+ Mockito.verify(mockZK, Mockito.times(5)).exists(zkLockPathName, true,
+ elector, null);
+
+ }
+
+ /**
+ * verify becomeStandby is not called if already in standby
+ */
+ @Test
+ public void testSuccessiveStandbyCalls() {
+ elector.joinElection(data);
+
+ // make the object go into the monitoring standby state
+ elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeStandby();
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+
+ WatchedEvent mockEvent = Mockito.mock(WatchedEvent.class);
+ Mockito.when(mockEvent.getPath()).thenReturn(zkLockPathName);
+
+ // notify node deletion
+ // monitoring should be setup again after event is received
+ Mockito.when(mockEvent.getType()).thenReturn(Event.EventType.NodeDeleted);
+ elector.process(mockEvent);
+ // is standby. no need to notify anything now
+ Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
+ // another joinElection called.
+ Mockito.verify(mockZK, Mockito.times(2)).create(zkLockPathName, data,
+ Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL, elector, null);
+ // lost election
+ elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ // still standby. so no need to notify again
+ Mockito.verify(mockApp, Mockito.times(1)).becomeStandby();
+ // monitor is set again
+ Mockito.verify(mockZK, Mockito.times(2)).exists(zkLockPathName, true,
+ elector, null);
+ }
+
+ /**
+ * verify that quitElection terminates the connection and adds no new watches.
+ * The next call to joinElection creates a new connection and performs the
+ * election
+ */
+ @Test
+ public void testQuitElection() throws InterruptedException {
+ elector.quitElection();
+ Mockito.verify(mockZK, Mockito.times(1)).close();
+ // no watches added
+ Mockito.verify(mockZK, Mockito.times(0)).exists(zkLockPathName, true,
+ elector, null);
+
+ byte[] data = new byte[8];
+ elector.joinElection(data);
+ // getNewZooKeeper called 2 times. once in constructor and once now
+ Assert.assertEquals(2, TestActiveStandbyElector.count);
+ elector.processResult(Code.NODEEXISTS.intValue(), zkLockPathName, null,
+ zkLockPathName);
+ Mockito.verify(mockApp, Mockito.times(1)).becomeStandby();
+ Mockito.verify(mockZK, Mockito.times(1)).exists(zkLockPathName, true,
+ elector, null);
+
+ }
+
+ /**
+ * verify that getActiveData returns data when an active exists, reports when
+ * the active does not exist, and rethrows errors in getting active information
+ *
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws KeeperException
+ * @throws ActiveNotFoundException
+ */
+ @Test
+ public void testGetActiveData() throws ActiveNotFoundException,
+ KeeperException, InterruptedException, IOException {
+ // get valid active data
+ byte[] data = new byte[8];
+ Mockito.when(
+ mockZK.getData(Mockito.eq(zkLockPathName), Mockito.eq(false),
+ Mockito.<Stat> anyObject())).thenReturn(data);
+ Assert.assertEquals(data, elector.getActiveData());
+ Mockito.verify(mockZK, Mockito.times(1)).getData(
+ Mockito.eq(zkLockPathName), Mockito.eq(false),
+ Mockito.<Stat> anyObject());
+
+ // active does not exist
+ Mockito.when(
+ mockZK.getData(Mockito.eq(zkLockPathName), Mockito.eq(false),
+ Mockito.<Stat> anyObject())).thenThrow(
+ new KeeperException.NoNodeException());
+ try {
+ elector.getActiveData();
+ Assert.fail("ActiveNotFoundException expected");
+ } catch(ActiveNotFoundException e) {
+ Mockito.verify(mockZK, Mockito.times(2)).getData(
+ Mockito.eq(zkLockPathName), Mockito.eq(false),
+ Mockito.<Stat> anyObject());
+ }
+
+ // error getting active data rethrows keeperexception
+ try {
+ Mockito.when(
+ mockZK.getData(Mockito.eq(zkLockPathName), Mockito.eq(false),
+ Mockito.<Stat> anyObject())).thenThrow(
+ new KeeperException.AuthFailedException());
+ elector.getActiveData();
+ Assert.fail("KeeperException.AuthFailedException expected");
+ } catch(KeeperException.AuthFailedException ke) {
+ Mockito.verify(mockZK, Mockito.times(3)).getData(
+ Mockito.eq(zkLockPathName), Mockito.eq(false),
+ Mockito.<Stat> anyObject());
+ }
+ }
+
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
new file mode 100644
index 0000000..672e8d3
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestActiveStandbyElectorRealZK.java
@@ -0,0 +1,231 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ha;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.ZooDefs.Ids;
+import org.apache.zookeeper.data.ACL;
+import org.apache.zookeeper.test.ClientBase;
+
+/**
+ * Test for {@link ActiveStandbyElector} using real zookeeper.
+ */
+public class TestActiveStandbyElectorRealZK extends ClientBase {
+ static final int NUM_ELECTORS = 2;
+ static ZooKeeper[] zkClient = new ZooKeeper[NUM_ELECTORS];
+ static int currentClientIndex = 0;
+
+ @Override
+ public void setUp() throws Exception {
+ // build.test.dir is used by zookeeper
+ new File(System.getProperty("build.test.dir", "build")).mkdirs();
+ super.setUp();
+ }
+
+ class ActiveStandbyElectorTesterRealZK extends ActiveStandbyElector {
+ ActiveStandbyElectorTesterRealZK(String hostPort, int timeout,
+ String parent, List<ACL> acl, ActiveStandbyElectorCallback app)
+ throws IOException {
+ super(hostPort, timeout, parent, acl, app);
+ }
+
+ @Override
+ public ZooKeeper getNewZooKeeper() {
+ return TestActiveStandbyElectorRealZK.zkClient[
+ TestActiveStandbyElectorRealZK.currentClientIndex];
+ }
+ }
+
+ /**
+ * The class object runs on a thread and waits for a signal from the test
+ * object to start. On getting the signal it joins the election; by doing this
+ * on multiple threads we can test simultaneous attempts at leader lock
+ * creation. After joining the election, the object waits on a signal to exit.
+ * This signal comes when the object's elector has become the leader or there
+ * is an unexpected fatal error, letting another thread object become the
+ * leader.
+ */
+ class ThreadRunner implements Runnable, ActiveStandbyElectorCallback {
+ int index;
+ TestActiveStandbyElectorRealZK test;
+ boolean wait = true;
+
+ ThreadRunner(int i, TestActiveStandbyElectorRealZK s) {
+ index = i;
+ test = s;
+ }
+
+ @Override
+ public void run() {
+ LOG.info("starting " + index);
+ while(true) {
+ synchronized (test) {
+ // wait for test start signal to come
+ if (!test.start) {
+ try {
+ test.wait();
+ } catch(InterruptedException e) {
+ Assert.fail(e.getMessage());
+ }
+ } else {
+ break;
+ }
+ }
+ }
+ // join election
+ byte[] data = new byte[8];
+ ActiveStandbyElector elector = test.elector[index];
+ LOG.info("joining " + index);
+ elector.joinElection(data);
+ try {
+ while(true) {
+ synchronized (this) {
+ // wait for elector to become active/fatal error
+ if (wait) {
+ // wait to become active
+ // wait capped at 30s to prevent hung test
+ wait(30000);
+ } else {
+ break;
+ }
+ }
+ }
+ Thread.sleep(1000);
+ // quit election to allow other elector to become active
+ elector.quitElection();
+ } catch(InterruptedException e) {
+ Assert.fail(e.getMessage());
+ }
+ LOG.info("ending " + index);
+ }
+
+ @Override
+ public synchronized void becomeActive() {
+ test.reportActive(index);
+ LOG.info("active " + index);
+ wait = false;
+ notifyAll();
+ }
+
+ @Override
+ public synchronized void becomeStandby() {
+ test.reportStandby(index);
+ LOG.info("standby " + index);
+ }
+
+ @Override
+ public synchronized void enterNeutralMode() {
+ LOG.info("neutral " + index);
+ }
+
+ @Override
+ public synchronized void notifyFatalError(String errorMessage) {
+ LOG.info("fatal " + index + " .Error message:" + errorMessage);
+ wait = false;
+ notifyAll();
+ }
+ }
+
+ boolean start = false;
+ int activeIndex = -1;
+ int standbyIndex = -1;
+ String parentDir = "/" + java.util.UUID.randomUUID().toString();
+
+ ActiveStandbyElector[] elector = new ActiveStandbyElector[NUM_ELECTORS];
+ ThreadRunner[] threadRunner = new ThreadRunner[NUM_ELECTORS];
+ Thread[] thread = new Thread[NUM_ELECTORS];
+
+ synchronized void reportActive(int index) {
+ if (activeIndex == -1) {
+ activeIndex = index;
+ } else {
+ // standby should become active
+ Assert.assertEquals(standbyIndex, index);
+ // old active should not become active
+ Assert.assertFalse(activeIndex == index);
+ }
+ activeIndex = index;
+ }
+
+ synchronized void reportStandby(int index) {
+ // only 1 standby should be reported and it should not be the same as active
+ Assert.assertEquals(-1, standbyIndex);
+ standbyIndex = index;
+ Assert.assertFalse(activeIndex == standbyIndex);
+ }
+
+ /**
+ * The test creates 2 electors which try to become active using a real
+ * ZooKeeper server. It verifies that 1 becomes active and 1 becomes standby.
+ * Upon becoming active the leader quits the election and the test verifies
+ * that the standby now becomes active. These electors run on different
+ * threads and call back to the test class to report active and standby,
+ * where the outcome is verified.
+ *
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws KeeperException
+ */
+ @Test
+ public void testActiveStandbyTransition() throws IOException,
+ InterruptedException, KeeperException {
+ LOG.info("starting test with parentDir:" + parentDir);
+ start = false;
+ byte[] data = new byte[8];
+ // create random working directory
+ createClient().create(parentDir, data, Ids.OPEN_ACL_UNSAFE,
+ CreateMode.PERSISTENT);
+
+ for(currentClientIndex = 0;
+ currentClientIndex < NUM_ELECTORS;
+ ++currentClientIndex) {
+ LOG.info("creating " + currentClientIndex);
+ zkClient[currentClientIndex] = createClient();
+ threadRunner[currentClientIndex] = new ThreadRunner(currentClientIndex,
+ this);
+ elector[currentClientIndex] = new ActiveStandbyElectorTesterRealZK(
+ "hostPort", 1000, parentDir, Ids.OPEN_ACL_UNSAFE,
+ threadRunner[currentClientIndex]);
+ zkClient[currentClientIndex].register(elector[currentClientIndex]);
+ thread[currentClientIndex] = new Thread(threadRunner[currentClientIndex]);
+ thread[currentClientIndex].start();
+ }
+
+ synchronized (this) {
+ // signal threads to start
+ LOG.info("signaling threads");
+ start = true;
+ notifyAll();
+ }
+
+ for(int i = 0; i < thread.length; i++) {
+ thread[i].join();
+ }
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java
new file mode 100644
index 0000000..9e2cc75
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java
@@ -0,0 +1,441 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.verify;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
+import org.apache.hadoop.ha.protocolPB.HAServiceProtocolClientSideTranslatorPB;
+import org.apache.hadoop.ha.TestNodeFencer.AlwaysSucceedFencer;
+import org.apache.hadoop.ha.TestNodeFencer.AlwaysFailFencer;
+import static org.apache.hadoop.ha.TestNodeFencer.setupFencer;
+import org.apache.hadoop.ipc.ProtocolSignature;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.AccessControlException;
+
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class TestFailoverController {
+
+ private InetSocketAddress svc1Addr = new InetSocketAddress("svc1", 1234);
+ private InetSocketAddress svc2Addr = new InetSocketAddress("svc2", 5678);
+
+ private class DummyService implements HAServiceProtocol {
+ HAServiceState state;
+
+ DummyService(HAServiceState state) {
+ this.state = state;
+ }
+
+ @Override
+ public void monitorHealth() throws HealthCheckFailedException, IOException {
+ // Do nothing
+ }
+
+ @Override
+ public void transitionToActive() throws ServiceFailedException, IOException {
+ state = HAServiceState.ACTIVE;
+ }
+
+ @Override
+ public void transitionToStandby() throws ServiceFailedException, IOException {
+ state = HAServiceState.STANDBY;
+ }
+
+ @Override
+ public HAServiceState getServiceState() throws IOException {
+ return state;
+ }
+
+ @Override
+ public boolean readyToBecomeActive() throws ServiceFailedException, IOException {
+ return true;
+ }
+ }
+
+ @Test
+ public void testFailoverAndFailback() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY);
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ AlwaysSucceedFencer.fenceCalled = 0;
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
+ assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
+ assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
+
+ AlwaysSucceedFencer.fenceCalled = 0;
+ FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false, false);
+ assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
+ }
+
+ @Test
+ public void testFailoverFromStandbyToStandby() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.STANDBY);
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY);
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
+ assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
+ }
+
+ @Test
+ public void testFailoverFromActiveToActive() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ DummyService svc2 = new DummyService(HAServiceState.ACTIVE);
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Can't failover to an already active service");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
+ }
+
+ @Test
+ public void testFailoverWithoutPermission() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE) {
+ @Override
+ public HAServiceState getServiceState() throws IOException {
+ throw new AccessControlException("Access denied");
+ }
+ };
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
+ @Override
+ public HAServiceState getServiceState() throws IOException {
+ throw new AccessControlException("Access denied");
+ }
+ };
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Can't failover when access is denied");
+ } catch (FailoverFailedException ffe) {
+ assertTrue(ffe.getCause().getMessage().contains("Access denied"));
+ }
+ }
+
+ @Test
+ public void testFailoverToUnreadyService() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
+ @Override
+ public boolean readyToBecomeActive() throws ServiceFailedException, IOException {
+ return false;
+ }
+ };
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Can't failover to a service that's not ready");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
+
+ // Forcing it means we ignore readyToBecomeActive
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, true);
+ assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
+ assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
+ }
+
+ @Test
+ public void testFailoverToUnhealthyServiceFailsAndFailsback() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
+ @Override
+ public void monitorHealth() throws HealthCheckFailedException {
+ throw new HealthCheckFailedException("Failed!");
+ }
+ };
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Failover to unhealthy service");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
+ }
+
+ @Test
+ public void testFailoverFromFaultyServiceSucceeds() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE) {
+ @Override
+ public void transitionToStandby() throws ServiceFailedException {
+ throw new ServiceFailedException("Failed!");
+ }
+ };
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY);
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ AlwaysSucceedFencer.fenceCalled = 0;
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ } catch (FailoverFailedException ffe) {
+ fail("Faulty active prevented failover");
+ }
+
+ // svc1 still thinks it's active, that's OK, it was fenced
+ assertEquals(1, AlwaysSucceedFencer.fenceCalled);
+ assertEquals("svc1:1234", AlwaysSucceedFencer.fencedSvc);
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
+ }
+
+ @Test
+ public void testFailoverFromFaultyServiceFencingFailure() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE) {
+ @Override
+ public void transitionToStandby() throws ServiceFailedException {
+ throw new ServiceFailedException("Failed!");
+ }
+ };
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY);
+ NodeFencer fencer = setupFencer(AlwaysFailFencer.class.getName());
+
+ AlwaysFailFencer.fenceCalled = 0;
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Failed over even though fencing failed");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+ assertEquals(1, AlwaysFailFencer.fenceCalled);
+ assertEquals("svc1:1234", AlwaysFailFencer.fencedSvc);
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
+ }
+
+ @Test
+ public void testFencingFailureDuringFailover() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY);
+ NodeFencer fencer = setupFencer(AlwaysFailFencer.class.getName());
+
+ AlwaysFailFencer.fenceCalled = 0;
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false);
+ fail("Failed over even though fencing requested and failed");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+ // If fencing was requested and it failed we don't try to make
+ // svc2 active anyway, and we don't failback to svc1.
+ assertEquals(1, AlwaysFailFencer.fenceCalled);
+ assertEquals("svc1:1234", AlwaysFailFencer.fencedSvc);
+ assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
+ assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
+ }
+
+ private HAServiceProtocol getProtocol(String target)
+ throws IOException {
+ InetSocketAddress addr = NetUtils.createSocketAddr(target);
+ Configuration conf = new Configuration();
+    // Lower the number of connect retries so we quickly fail to connect
+ conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 1);
+ return new HAServiceProtocolClientSideTranslatorPB(addr, conf);
+ }
+
+ @Test
+  public void testFailoverFromNonExistentServiceWithFencer() throws Exception {
+ HAServiceProtocol svc1 = getProtocol("localhost:1234");
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY);
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ } catch (FailoverFailedException ffe) {
+ fail("Non-existant active prevented failover");
+ }
+
+ // Don't check svc1 because we can't reach it, but that's OK, it's been fenced.
+ assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
+ }
+
+ @Test
+  public void testFailoverToNonExistentServiceFails() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ HAServiceProtocol svc2 = getProtocol("localhost:1234");
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Failed over to a non-existant standby");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ }
+
+ @Test
+ public void testFailoverToFaultyServiceFailsbackOK() throws Exception {
+ DummyService svc1 = spy(new DummyService(HAServiceState.ACTIVE));
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
+ @Override
+ public void transitionToActive() throws ServiceFailedException {
+ throw new ServiceFailedException("Failed!");
+ }
+ };
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Failover to already active service");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+ // svc1 went standby then back to active
+ verify(svc1).transitionToStandby();
+ verify(svc1).transitionToActive();
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
+ }
+
+ @Test
+ public void testWeDontFailbackIfActiveWasFenced() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
+ @Override
+ public void transitionToActive() throws ServiceFailedException {
+ throw new ServiceFailedException("Failed!");
+ }
+ };
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false);
+ fail("Failed over to service that won't transition to active");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+ // We failed to failover and did not failback because we fenced
+ // svc1 (we forced it), therefore svc1 and svc2 should be standby.
+ assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
+ assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
+ }
+
+ @Test
+ public void testWeFenceOnFailbackIfTransitionToActiveFails() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
+ @Override
+ public void transitionToActive() throws ServiceFailedException, IOException {
+ throw new IOException("Failed!");
+ }
+ };
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+ AlwaysSucceedFencer.fenceCalled = 0;
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Failed over to service that won't transition to active");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+    // The failover failed. We did not fence svc1 because it cooperated and we
+    // didn't force fencing, so we fenced svc2 and failed back to svc1.
+    // We can't be sure what state svc2 was left in, but that's OK, we fenced it.
+ assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
+ assertEquals(1, AlwaysSucceedFencer.fenceCalled);
+ assertEquals("svc2:5678", AlwaysSucceedFencer.fencedSvc);
+ }
+
+ @Test
+ public void testFailureToFenceOnFailbackFailsTheFailback() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
+ @Override
+ public void transitionToActive() throws ServiceFailedException, IOException {
+ throw new IOException("Failed!");
+ }
+ };
+ NodeFencer fencer = setupFencer(AlwaysFailFencer.class.getName());
+ AlwaysFailFencer.fenceCalled = 0;
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Failed over to service that won't transition to active");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+    // We did not fence svc1 because it cooperated and we didn't force fencing.
+    // The failover failed, so we tried to fence svc2; since that fencing also
+    // failed we did not fail back to svc1, i.e. it is still standby.
+ assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
+ assertEquals(1, AlwaysFailFencer.fenceCalled);
+ assertEquals("svc2:5678", AlwaysFailFencer.fencedSvc);
+ }
+
+ @Test
+ public void testFailbackToFaultyServiceFails() throws Exception {
+ DummyService svc1 = new DummyService(HAServiceState.ACTIVE) {
+ @Override
+ public void transitionToActive() throws ServiceFailedException {
+ throw new ServiceFailedException("Failed!");
+ }
+ };
+ DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
+ @Override
+ public void transitionToActive() throws ServiceFailedException {
+ throw new ServiceFailedException("Failed!");
+ }
+ };
+ NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
+
+ try {
+ FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
+ fail("Failover to already active service");
+ } catch (FailoverFailedException ffe) {
+ // Expected
+ }
+
+ assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
+ assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java
new file mode 100644
index 0000000..f22056a
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
+import org.apache.hadoop.conf.Configuration;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+import static org.mockito.Mockito.when;
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Joiner;
+
+public class TestHAAdmin {
+ private static final Log LOG = LogFactory.getLog(TestHAAdmin.class);
+
+ private HAAdmin tool;
+ private ByteArrayOutputStream errOutBytes = new ByteArrayOutputStream();
+ private String errOutput;
+ private HAServiceProtocol mockProtocol;
+
+ @Before
+ public void setup() throws IOException {
+ mockProtocol = Mockito.mock(HAServiceProtocol.class);
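+    // The mocked target service always reports that it is ready to become active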
+ when(mockProtocol.readyToBecomeActive()).thenReturn(true);
+ tool = new HAAdmin() {
+ @Override
+ protected HAServiceProtocol getProtocol(String target) throws IOException {
+ return mockProtocol;
+ }
+ };
+ tool.setConf(new Configuration());
+ tool.errOut = new PrintStream(errOutBytes);
+ }
+
+ private void assertOutputContains(String string) {
+ if (!errOutput.contains(string)) {
+ fail("Expected output to contain '" + string + "' but was:\n" +
+ errOutput);
+ }
+ }
+
+ @Test
+ public void testAdminUsage() throws Exception {
+ assertEquals(-1, runTool());
+ assertOutputContains("Usage:");
+ assertOutputContains("-transitionToActive");
+
+ assertEquals(-1, runTool("badCommand"));
+ assertOutputContains("Bad command 'badCommand'");
+
+ assertEquals(-1, runTool("-badCommand"));
+ assertOutputContains("badCommand: Unknown");
+
+ // valid command but not enough arguments
+ assertEquals(-1, runTool("-transitionToActive"));
+ assertOutputContains("transitionToActive: incorrect number of arguments");
+ assertEquals(-1, runTool("-transitionToActive", "x", "y"));
+ assertOutputContains("transitionToActive: incorrect number of arguments");
+ assertEquals(-1, runTool("-failover"));
+ assertOutputContains("failover: incorrect arguments");
+ assertOutputContains("failover: incorrect arguments");
+ assertEquals(-1, runTool("-failover", "foo:1234"));
+ assertOutputContains("failover: incorrect arguments");
+ }
+
+ @Test
+ public void testHelp() throws Exception {
+ assertEquals(-1, runTool("-help"));
+ assertEquals(0, runTool("-help", "transitionToActive"));
+ assertOutputContains("Transitions the service into Active");
+ }
+
+  private int runTool(String ... args) throws Exception {
+ errOutBytes.reset();
+ LOG.info("Running: HAAdmin " + Joiner.on(" ").join(args));
+ int ret = tool.run(args);
+ errOutput = new String(errOutBytes.toByteArray(), Charsets.UTF_8);
+ LOG.info("Output:\n" + errOutput);
+ return ret;
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestNodeFencer.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestNodeFencer.java
new file mode 100644
index 0000000..5508547
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestNodeFencer.java
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import static org.junit.Assert.*;
+
+import java.net.InetSocketAddress;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class TestNodeFencer {
+
+ @Before
+ public void clearMockState() {
+ AlwaysSucceedFencer.fenceCalled = 0;
+ AlwaysSucceedFencer.callArgs.clear();
+ AlwaysFailFencer.fenceCalled = 0;
+ AlwaysFailFencer.callArgs.clear();
+ }
+
+ @Test
+ public void testSingleFencer() throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer(
+ AlwaysSucceedFencer.class.getName() + "(foo)");
+ assertTrue(fencer.fence(new InetSocketAddress("host", 1234)));
+ assertEquals(1, AlwaysSucceedFencer.fenceCalled);
+ assertEquals("host:1234", AlwaysSucceedFencer.fencedSvc);
+ assertEquals("foo", AlwaysSucceedFencer.callArgs.get(0));
+ }
+
+ @Test
+ public void testMultipleFencers() throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer(
+ AlwaysSucceedFencer.class.getName() + "(foo)\n" +
+ AlwaysSucceedFencer.class.getName() + "(bar)\n");
+ assertTrue(fencer.fence(new InetSocketAddress("host", 1234)));
+ // Only one call, since the first fencer succeeds
+ assertEquals(1, AlwaysSucceedFencer.fenceCalled);
+ assertEquals("foo", AlwaysSucceedFencer.callArgs.get(0));
+ }
+
+ @Test
+ public void testWhitespaceAndCommentsInConfig()
+ throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer(
+ "\n" +
+ " # the next one will always fail\n" +
+ " " + AlwaysFailFencer.class.getName() + "(foo) # <- fails\n" +
+ AlwaysSucceedFencer.class.getName() + "(bar) \n");
+ assertTrue(fencer.fence(new InetSocketAddress("host", 1234)));
+ // One call to each, since top fencer fails
+ assertEquals(1, AlwaysFailFencer.fenceCalled);
+ assertEquals("host:1234", AlwaysFailFencer.fencedSvc);
+ assertEquals(1, AlwaysSucceedFencer.fenceCalled);
+ assertEquals("host:1234", AlwaysSucceedFencer.fencedSvc);
+ assertEquals("foo", AlwaysFailFencer.callArgs.get(0));
+ assertEquals("bar", AlwaysSucceedFencer.callArgs.get(0));
+ }
+
+ @Test
+ public void testArglessFencer() throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer(
+ AlwaysSucceedFencer.class.getName());
+ assertTrue(fencer.fence(new InetSocketAddress("host", 1234)));
+    // A single call to the lone fencer
+ assertEquals(1, AlwaysSucceedFencer.fenceCalled);
+ assertEquals("host:1234", AlwaysSucceedFencer.fencedSvc);
+ assertEquals(null, AlwaysSucceedFencer.callArgs.get(0));
+ }
+
+ @Test
+ public void testShortNameShell() throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer("shell(true)");
+ assertTrue(fencer.fence(new InetSocketAddress("host", 1234)));
+ }
+
+ @Test
+ public void testShortNameSsh() throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer("sshfence");
+ assertFalse(fencer.fence(new InetSocketAddress("host", 1234)));
+ }
+
+ @Test
+ public void testShortNameSshWithUser() throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer("sshfence(user)");
+ assertFalse(fencer.fence(new InetSocketAddress("host", 1234)));
+ }
+
+ @Test
+ public void testShortNameSshWithPort() throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer("sshfence(:123)");
+ assertFalse(fencer.fence(new InetSocketAddress("host", 1234)));
+ }
+
+ @Test
+ public void testShortNameSshWithUserPort() throws BadFencingConfigurationException {
+ NodeFencer fencer = setupFencer("sshfence(user:123)");
+ assertFalse(fencer.fence(new InetSocketAddress("host", 1234)));
+ }
+
+ public static NodeFencer setupFencer(String confStr)
+ throws BadFencingConfigurationException {
+ System.err.println("Testing configuration:\n" + confStr);
+ Configuration conf = new Configuration();
+ conf.set(NodeFencer.CONF_METHODS_KEY, confStr);
+ return new NodeFencer(conf);
+ }
+
+ /**
+ * Mock fencing method that always returns true
+ */
+ public static class AlwaysSucceedFencer extends Configured
+ implements FenceMethod {
+ static int fenceCalled = 0;
+ static String fencedSvc;
+ static List<String> callArgs = Lists.newArrayList();
+
+ @Override
+ public boolean tryFence(InetSocketAddress serviceAddr, String args) {
+ fencedSvc = serviceAddr.getHostName() + ":" + serviceAddr.getPort();
+ callArgs.add(args);
+ fenceCalled++;
+ return true;
+ }
+
+ @Override
+ public void checkArgs(String args) {
+ }
+ }
+
+ /**
+ * Identical mock to above, except always returns false
+ */
+ public static class AlwaysFailFencer extends Configured
+ implements FenceMethod {
+ static int fenceCalled = 0;
+ static String fencedSvc;
+ static List<String> callArgs = Lists.newArrayList();
+
+ @Override
+ public boolean tryFence(InetSocketAddress serviceAddr, String args) {
+ fencedSvc = serviceAddr.getHostName() + ":" + serviceAddr.getPort();
+ callArgs.add(args);
+ fenceCalled++;
+ return false;
+ }
+
+ @Override
+ public void checkArgs(String args) {
+ }
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java
new file mode 100644
index 0000000..49bae03
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java
@@ -0,0 +1,154 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import static org.junit.Assert.*;
+
+import java.net.InetSocketAddress;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import static org.mockito.Mockito.spy;
+
+public class TestShellCommandFencer {
+ private ShellCommandFencer fencer = createFencer();
+
+ @BeforeClass
+ public static void setupLogSpy() {
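+    // Wrap the fencer's logger in a Mockito spy so tests can verify what gets logged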
+ ShellCommandFencer.LOG = spy(ShellCommandFencer.LOG);
+ }
+
+ @Before
+ public void resetLogSpy() {
+ Mockito.reset(ShellCommandFencer.LOG);
+ }
+
+ private static ShellCommandFencer createFencer() {
+ Configuration conf = new Configuration();
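+    // Surfaced to fenced commands as the environment variable $in_fencing_tests (see testConfAsEnvironment)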
+ conf.set("in.fencing.tests", "yessir");
+ ShellCommandFencer fencer = new ShellCommandFencer();
+ fencer.setConf(conf);
+ return fencer;
+ }
+
+ /**
+ * Test that the exit code of the script determines
+ * whether the fencer succeeded or failed
+ */
+ @Test
+ public void testBasicSuccessFailure() {
+ InetSocketAddress addr = new InetSocketAddress("host", 1234);
+ assertTrue(fencer.tryFence(addr, "echo"));
+ assertFalse(fencer.tryFence(addr, "exit 1"));
+ // bad path should also fail
+ assertFalse(fencer.tryFence(addr, "xxxxxxxxxxxx"));
+ }
+
+ @Test
+ public void testCheckNoArgs() {
+ try {
+ Configuration conf = new Configuration();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell");
+ new NodeFencer(conf);
+ fail("Didn't throw when passing no args to shell");
+ } catch (BadFencingConfigurationException confe) {
+ assertTrue(
+ "Unexpected exception:" + StringUtils.stringifyException(confe),
+ confe.getMessage().contains("No argument passed"));
+ }
+ }
+
+ @Test
+ public void testCheckParensNoArgs() {
+ try {
+ Configuration conf = new Configuration();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell()");
+ new NodeFencer(conf);
+ fail("Didn't throw when passing no args to shell");
+ } catch (BadFencingConfigurationException confe) {
+ assertTrue(
+ "Unexpected exception:" + StringUtils.stringifyException(confe),
+ confe.getMessage().contains("Unable to parse line: 'shell()'"));
+ }
+ }
+
+ /**
+ * Test that lines on stdout get passed as INFO
+ * level messages
+ */
+ @Test
+ public void testStdoutLogging() {
+ InetSocketAddress addr = new InetSocketAddress("host", 1234);
+ assertTrue(fencer.tryFence(addr, "echo hello"));
+ Mockito.verify(ShellCommandFencer.LOG).info(
+ Mockito.endsWith("echo hello: host:1234 hello"));
+ }
+
+ /**
+ * Test that lines on stderr get passed as
+ * WARN level log messages
+ */
+ @Test
+ public void testStderrLogging() {
+ InetSocketAddress addr = new InetSocketAddress("host", 1234);
+ assertTrue(fencer.tryFence(addr, "echo hello >&2"));
+ Mockito.verify(ShellCommandFencer.LOG).warn(
+ Mockito.endsWith("echo hello >&2: host:1234 hello"));
+ }
+
+ /**
+ * Verify that the Configuration gets passed as
+ * environment variables to the fencer.
+ */
+ @Test
+ public void testConfAsEnvironment() {
+ InetSocketAddress addr = new InetSocketAddress("host", 1234);
+ fencer.tryFence(addr, "echo $in_fencing_tests");
+ Mockito.verify(ShellCommandFencer.LOG).info(
+ Mockito.endsWith("echo $in...ing_tests: host:1234 yessir"));
+ }
+
+ /**
+ * Test that we properly close off our input to the subprocess
+ * such that it knows there's no tty connected. This is important
+ * so that, if we use 'ssh', it won't try to prompt for a password
+ * and block forever, for example.
+ */
+ @Test(timeout=10000)
+ public void testSubprocessInputIsClosed() {
+ InetSocketAddress addr = new InetSocketAddress("host", 1234);
+ assertFalse(fencer.tryFence(addr, "read"));
+ }
+
+ @Test
+ public void testCommandAbbreviation() {
+ assertEquals("a...f", ShellCommandFencer.abbreviate("abcdef", 5));
+ assertEquals("abcdef", ShellCommandFencer.abbreviate("abcdef", 6));
+ assertEquals("abcdef", ShellCommandFencer.abbreviate("abcdef", 7));
+
+ assertEquals("a...g", ShellCommandFencer.abbreviate("abcdefg", 5));
+ assertEquals("a...h", ShellCommandFencer.abbreviate("abcdefgh", 5));
+ assertEquals("a...gh", ShellCommandFencer.abbreviate("abcdefgh", 6));
+ assertEquals("ab...gh", ShellCommandFencer.abbreviate("abcdefgh", 7));
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestSshFenceByTcpPort.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestSshFenceByTcpPort.java
new file mode 100644
index 0000000..f89df6a
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestSshFenceByTcpPort.java
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ha;
+
+import static org.junit.Assert.*;
+
+import java.net.InetSocketAddress;
+
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ha.SshFenceByTcpPort.Args;
+import org.apache.log4j.Level;
+import org.junit.Assume;
+import org.junit.Test;
+
+public class TestSshFenceByTcpPort {
+
+ static {
+ ((Log4JLogger)SshFenceByTcpPort.LOG).getLogger().setLevel(Level.ALL);
+ }
+
+ private String TEST_FENCING_HOST = System.getProperty(
+ "test.TestSshFenceByTcpPort.host", "localhost");
+ private String TEST_FENCING_PORT = System.getProperty(
+ "test.TestSshFenceByTcpPort.port", "8020");
+ private final String TEST_KEYFILE = System.getProperty(
+ "test.TestSshFenceByTcpPort.key");
+
+ @Test(timeout=20000)
+ public void testFence() throws BadFencingConfigurationException {
+ Assume.assumeTrue(isConfigured());
+ Configuration conf = new Configuration();
+ conf.set(SshFenceByTcpPort.CONF_IDENTITIES_KEY, TEST_KEYFILE);
+ SshFenceByTcpPort fence = new SshFenceByTcpPort();
+ fence.setConf(conf);
+ assertTrue(fence.tryFence(
+ new InetSocketAddress(TEST_FENCING_HOST,
+ Integer.valueOf(TEST_FENCING_PORT)),
+ null));
+ }
+
+ /**
+ * Test connecting to a host which definitely won't respond.
+ * Make sure that it times out and returns false, but doesn't throw
+ * any exception
+ */
+ @Test(timeout=20000)
+ public void testConnectTimeout() throws BadFencingConfigurationException {
+ Configuration conf = new Configuration();
+ conf.setInt(SshFenceByTcpPort.CONF_CONNECT_TIMEOUT_KEY, 3000);
+ SshFenceByTcpPort fence = new SshFenceByTcpPort();
+ fence.setConf(conf);
+ // Connect to Google's DNS server - not running ssh!
+ assertFalse(fence.tryFence(new InetSocketAddress("8.8.8.8", 1234), ""));
+ }
+
+ @Test
+ public void testArgsParsing() throws BadFencingConfigurationException {
+ InetSocketAddress addr = new InetSocketAddress("bar.com", 1234);
+
+ Args args = new SshFenceByTcpPort.Args(addr, null);
+ assertEquals("bar.com", args.host);
+ assertEquals(1234, args.targetPort);
+ assertEquals(System.getProperty("user.name"), args.user);
+ assertEquals(22, args.sshPort);
+
+ args = new SshFenceByTcpPort.Args(addr, "");
+ assertEquals("bar.com", args.host);
+ assertEquals(1234, args.targetPort);
+ assertEquals(System.getProperty("user.name"), args.user);
+ assertEquals(22, args.sshPort);
+
+ args = new SshFenceByTcpPort.Args(addr, "12345");
+ assertEquals("bar.com", args.host);
+ assertEquals(1234, args.targetPort);
+ assertEquals("12345", args.user);
+ assertEquals(22, args.sshPort);
+
+ args = new SshFenceByTcpPort.Args(addr, ":12345");
+ assertEquals("bar.com", args.host);
+ assertEquals(1234, args.targetPort);
+ assertEquals(System.getProperty("user.name"), args.user);
+ assertEquals(12345, args.sshPort);
+
+ args = new SshFenceByTcpPort.Args(addr, "foo:8020");
+ assertEquals("bar.com", args.host);
+ assertEquals(1234, args.targetPort);
+ assertEquals("foo", args.user);
+ assertEquals(8020, args.sshPort);
+ }
+
+ @Test
+ public void testBadArgsParsing() throws BadFencingConfigurationException {
+ assertBadArgs(":"); // No port specified
+ assertBadArgs("bar.com:"); // "
+ assertBadArgs(":xx"); // Port does not parse
+ assertBadArgs("bar.com:xx"); // "
+ }
+
+ private void assertBadArgs(String argStr) {
+ InetSocketAddress addr = new InetSocketAddress("bar.com", 1234);
+ try {
+ new Args(addr, argStr);
+ fail("Did not fail on bad args: " + argStr);
+ } catch (BadFencingConfigurationException e) {
+ // Expected
+ }
+ }
+
+ private boolean isConfigured() {
+ return (TEST_FENCING_HOST != null && !TEST_FENCING_HOST.isEmpty()) &&
+ (TEST_FENCING_PORT != null && !TEST_FENCING_PORT.isEmpty()) &&
+ (TEST_KEYFILE != null && !TEST_KEYFILE.isEmpty());
+ }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java
index eec4797..4949ef3 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestFailoverProxy.java
@@ -25,21 +25,23 @@
import org.apache.hadoop.io.retry.UnreliableImplementation.TypeOfExceptionToFailWith;
import org.apache.hadoop.io.retry.UnreliableInterface.UnreliableException;
import org.apache.hadoop.ipc.StandbyException;
+import org.apache.hadoop.util.ThreadUtil;
import org.junit.Test;
+@SuppressWarnings("unchecked")
public class TestFailoverProxy {
- public static class FlipFlopProxyProvider implements FailoverProxyProvider {
+ public static class FlipFlopProxyProvider<T> implements FailoverProxyProvider<T> {
- private Class<?> iface;
- private Object currentlyActive;
- private Object impl1;
- private Object impl2;
+ private Class<T> iface;
+ private T currentlyActive;
+ private T impl1;
+ private T impl2;
private int failoversOccurred = 0;
- public FlipFlopProxyProvider(Class<?> iface, Object activeImpl,
- Object standbyImpl) {
+ public FlipFlopProxyProvider(Class<T> iface, T activeImpl,
+ T standbyImpl) {
this.iface = iface;
this.impl1 = activeImpl;
this.impl2 = standbyImpl;
@@ -47,7 +49,7 @@
}
@Override
- public Object getProxy() {
+ public T getProxy() {
return currentlyActive;
}
@@ -58,7 +60,7 @@
}
@Override
- public Class<?> getInterface() {
+ public Class<T> getInterface() {
return iface;
}
@@ -126,7 +128,7 @@
new FlipFlopProxyProvider(UnreliableInterface.class,
new UnreliableImplementation("impl1"),
new UnreliableImplementation("impl2")),
- RetryPolicies.TRY_ONCE_DONT_FAIL);
+ RetryPolicies.TRY_ONCE_THEN_FAIL);
unreliable.succeedsOnceThenFailsReturningString();
try {
@@ -180,7 +182,7 @@
assertEquals("impl1", unreliable.succeedsOnceThenFailsReturningString());
try {
- assertEquals("impl2", unreliable.succeedsOnceThenFailsReturningString());
+ unreliable.succeedsOnceThenFailsReturningString();
fail("should not have succeeded twice");
} catch (IOException e) {
// Make sure we *don't* fail over since the first implementation threw an
@@ -194,6 +196,27 @@
assertEquals("impl2", unreliable.succeedsOnceThenFailsReturningStringIdempotent());
}
+ /**
+ * Test that if a non-idempotent void function is called, and there is an exception,
+ * the exception is properly propagated
+ */
+ @Test
+ public void testExceptionPropagatedForNonIdempotentVoid() throws Exception {
+ UnreliableInterface unreliable = (UnreliableInterface)RetryProxy
+ .create(UnreliableInterface.class,
+ new FlipFlopProxyProvider(UnreliableInterface.class,
+ new UnreliableImplementation("impl1", TypeOfExceptionToFailWith.IO_EXCEPTION),
+ new UnreliableImplementation("impl2", TypeOfExceptionToFailWith.UNRELIABLE_EXCEPTION)),
+ RetryPolicies.failoverOnNetworkException(1));
+
+ try {
+ unreliable.nonIdempotentVoidFailsIfIdentifierDoesntMatch("impl2");
+ fail("did not throw an exception");
+ } catch (Exception e) {
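+      // Expected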
+ }
+ }
+
private static class SynchronizedUnreliableImplementation extends UnreliableImplementation {
private CountDownLatch methodLatch;
@@ -267,4 +290,62 @@
assertEquals("impl2", t2.result);
assertEquals(1, proxyProvider.getFailoversOccurred());
}
+
+ /**
+ * Ensure that when all configured services are throwing StandbyException
+ * that we fail over back and forth between them until one is no longer
+ * throwing StandbyException.
+ */
+ @Test
+ public void testFailoverBetweenMultipleStandbys()
+ throws UnreliableException, StandbyException, IOException {
+
+ final long millisToSleep = 10000;
+
+ final UnreliableImplementation impl1 = new UnreliableImplementation("impl1",
+ TypeOfExceptionToFailWith.STANDBY_EXCEPTION);
+ FlipFlopProxyProvider proxyProvider = new FlipFlopProxyProvider(
+ UnreliableInterface.class,
+ impl1,
+ new UnreliableImplementation("impl2",
+ TypeOfExceptionToFailWith.STANDBY_EXCEPTION));
+
+ final UnreliableInterface unreliable = (UnreliableInterface)RetryProxy
+ .create(UnreliableInterface.class, proxyProvider,
+ RetryPolicies.failoverOnNetworkException(
+ RetryPolicies.TRY_ONCE_THEN_FAIL, 10, 1000, 10000));
+
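+    // After a delay, rename impl1 so the retrying proxy eventually succeeds against it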
+ new Thread() {
+ @Override
+ public void run() {
+ ThreadUtil.sleepAtLeastIgnoreInterrupts(millisToSleep);
+ impl1.setIdentifier("renamed-impl1");
+ }
+ }.start();
+
+ String result = unreliable.failsIfIdentifierDoesntMatch("renamed-impl1");
+ assertEquals("renamed-impl1", result);
+ }
+
+ /**
+ * Ensure that normal IO exceptions don't result in a failover.
+ */
+ @Test
+ public void testExpectedIOException() {
+ UnreliableInterface unreliable = (UnreliableInterface)RetryProxy
+ .create(UnreliableInterface.class,
+ new FlipFlopProxyProvider(UnreliableInterface.class,
+ new UnreliableImplementation("impl1", TypeOfExceptionToFailWith.REMOTE_EXCEPTION),
+ new UnreliableImplementation("impl2", TypeOfExceptionToFailWith.UNRELIABLE_EXCEPTION)),
+ RetryPolicies.failoverOnNetworkException(
+ RetryPolicies.TRY_ONCE_THEN_FAIL, 10, 1000, 10000));
+
+ try {
+ unreliable.failsIfIdentifierDoesntMatch("no-such-identifier");
+ fail("Should have thrown *some* exception");
+ } catch (Exception e) {
+ assertTrue("Expected IOE but got " + e.getClass(),
+ e instanceof IOException);
+ }
+ }
}
\ No newline at end of file
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestRetryProxy.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestRetryProxy.java
index c48e87b..696f40d 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestRetryProxy.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestRetryProxy.java
@@ -19,7 +19,6 @@
package org.apache.hadoop.io.retry;
import static org.apache.hadoop.io.retry.RetryPolicies.RETRY_FOREVER;
-import static org.apache.hadoop.io.retry.RetryPolicies.TRY_ONCE_DONT_FAIL;
import static org.apache.hadoop.io.retry.RetryPolicies.TRY_ONCE_THEN_FAIL;
import static org.apache.hadoop.io.retry.RetryPolicies.retryByException;
import static org.apache.hadoop.io.retry.RetryPolicies.retryByRemoteException;
@@ -59,19 +58,6 @@
}
}
- public void testTryOnceDontFail() throws UnreliableException {
- UnreliableInterface unreliable = (UnreliableInterface)
- RetryProxy.create(UnreliableInterface.class, unreliableImpl, TRY_ONCE_DONT_FAIL);
- unreliable.alwaysSucceeds();
- unreliable.failsOnceThenSucceeds();
- try {
- unreliable.failsOnceThenSucceedsWithReturnValue();
- fail("Should fail");
- } catch (UnreliableException e) {
- // expected
- }
- }
-
public void testRetryForever() throws UnreliableException {
UnreliableInterface unreliable = (UnreliableInterface)
RetryProxy.create(UnreliableInterface.class, unreliableImpl, RETRY_FOREVER);
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java
index 7fa88b3..54fe677 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableImplementation.java
@@ -19,6 +19,7 @@
import java.io.IOException;
+import org.apache.hadoop.io.retry.UnreliableInterface.UnreliableException;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.StandbyException;
@@ -37,7 +38,8 @@
public static enum TypeOfExceptionToFailWith {
UNRELIABLE_EXCEPTION,
STANDBY_EXCEPTION,
- IO_EXCEPTION
+ IO_EXCEPTION,
+ REMOTE_EXCEPTION
}
public UnreliableImplementation() {
@@ -48,6 +50,10 @@
this(identifier, TypeOfExceptionToFailWith.UNRELIABLE_EXCEPTION);
}
+ public void setIdentifier(String identifier) {
+ this.identifier = identifier;
+ }
+
public UnreliableImplementation(String identifier,
TypeOfExceptionToFailWith exceptionToFailWith) {
this.identifier = identifier;
@@ -91,14 +97,7 @@
if (succeedsOnceThenFailsCount++ < 1) {
return identifier;
} else {
- switch (exceptionToFailWith) {
- case STANDBY_EXCEPTION:
- throw new StandbyException(identifier);
- case UNRELIABLE_EXCEPTION:
- throw new UnreliableException(identifier);
- case IO_EXCEPTION:
- throw new IOException(identifier);
- }
+ throwAppropriateException(exceptionToFailWith, identifier);
return null;
}
}
@@ -109,16 +108,8 @@
if (succeedsTenTimesThenFailsCount++ < 10) {
return identifier;
} else {
- switch (exceptionToFailWith) {
- case STANDBY_EXCEPTION:
- throw new StandbyException(identifier);
- case UNRELIABLE_EXCEPTION:
- throw new UnreliableException(identifier);
- case IO_EXCEPTION:
- throw new IOException(identifier);
- default:
- throw new RuntimeException(identifier);
- }
+ throwAppropriateException(exceptionToFailWith, identifier);
+ return null;
}
}
@@ -128,16 +119,8 @@
if (succeedsOnceThenFailsIdempotentCount++ < 1) {
return identifier;
} else {
- switch (exceptionToFailWith) {
- case STANDBY_EXCEPTION:
- throw new StandbyException(identifier);
- case UNRELIABLE_EXCEPTION:
- throw new UnreliableException(identifier);
- case IO_EXCEPTION:
- throw new IOException(identifier);
- default:
- throw new RuntimeException(identifier);
- }
+ throwAppropriateException(exceptionToFailWith, identifier);
+ return null;
}
}
@@ -147,17 +130,38 @@
if (this.identifier.equals(identifier)) {
return identifier;
} else {
- switch (exceptionToFailWith) {
- case STANDBY_EXCEPTION:
- throw new StandbyException(identifier);
- case UNRELIABLE_EXCEPTION:
- throw new UnreliableException(identifier);
- case IO_EXCEPTION:
- throw new IOException(identifier);
- default:
- throw new RuntimeException(identifier);
- }
+ String message = "expected '" + this.identifier + "' but received '" +
+ identifier + "'";
+ throwAppropriateException(exceptionToFailWith, message);
+ return null;
+ }
+ }
+
+ @Override
+ public void nonIdempotentVoidFailsIfIdentifierDoesntMatch(String identifier)
+ throws UnreliableException, StandbyException, IOException {
+ if (this.identifier.equals(identifier)) {
+ return;
+ } else {
+ String message = "expected '" + this.identifier + "' but received '" +
+ identifier + "'";
+ throwAppropriateException(exceptionToFailWith, message);
}
}
+ private static void throwAppropriateException(TypeOfExceptionToFailWith eType,
+ String message) throws UnreliableException, StandbyException, IOException {
+ switch (eType) {
+ case STANDBY_EXCEPTION:
+ throw new StandbyException(message);
+ case UNRELIABLE_EXCEPTION:
+ throw new UnreliableException(message);
+ case IO_EXCEPTION:
+ throw new IOException(message);
+ case REMOTE_EXCEPTION:
+ throw new RemoteException(IOException.class.getName(), message);
+ default:
+ throw new RuntimeException(message);
+ }
+ }
}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableInterface.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableInterface.java
index e794c16..66a8b85 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableInterface.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/UnreliableInterface.java
@@ -67,4 +67,7 @@
@Idempotent
public String failsIfIdentifierDoesntMatch(String identifier)
throws UnreliableException, StandbyException, IOException;
+
+ void nonIdempotentVoidFailsIfIdentifierDoesntMatch(String identifier)
+ throws UnreliableException, StandbyException, IOException;
}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestIPC.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestIPC.java
index 1f3e67a..efb2dc1 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestIPC.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestIPC.java
@@ -20,7 +20,9 @@
import org.apache.commons.logging.*;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind;
@@ -590,6 +592,38 @@
Server.RECEIVED_HTTP_REQ_RESPONSE.getBytes());
}
+ @Test
+ public void testConnectionRetriesOnSocketTimeoutExceptions() throws Exception {
+ Configuration conf = new Configuration();
+ // set max retries to 0
+ conf.setInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
+ 0);
+ assertRetriesOnSocketTimeouts(conf, 1);
+
+ // set max retries to 3
+ conf.setInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
+ 3);
+ assertRetriesOnSocketTimeouts(conf, 4);
+ }
+
+ private void assertRetriesOnSocketTimeouts(Configuration conf,
+ int maxTimeoutRetries) throws IOException, InterruptedException {
+ SocketFactory mockFactory = Mockito.mock(SocketFactory.class);
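+    // Socket creation always times out, so every connect attempt exercises the retry logic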
+ doThrow(new SocketTimeoutException()).when(mockFactory).createSocket();
+ Client client = new Client(IntWritable.class, conf, mockFactory);
+ InetSocketAddress address = new InetSocketAddress("127.0.0.1", 9090);
+ try {
+ client.call(new IntWritable(RANDOM.nextInt()), address, null, null, 0,
+ conf);
+ fail("Not throwing the SocketTimeoutException");
+ } catch (SocketTimeoutException e) {
+ Mockito.verify(mockFactory, Mockito.times(maxTimeoutRetries))
+ .createSocket();
+ }
+ }
+
private void doIpcVersionTest(
byte[] requestData,
byte[] expectedResponse) throws Exception {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt
new file mode 100644
index 0000000..3e59df7
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt
@@ -0,0 +1,252 @@
+Changes for HDFS-1623 branch.
+
+This change list will be merged into the trunk CHANGES.txt when the HDFS-1623
+branch is merged.
+------------------------------
+
+HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd)
+
+HDFS-1974. Introduce active and standby states to the namenode. (suresh)
+
+HDFS-2407. getServerDefaults and getStats don't check operation category (atm)
+
+HDFS-1973. HA: HDFS clients must handle namenode failover and switch over to the new active namenode. (atm)
+
+HDFS-2301. Start/stop appropriate namenode services when transition to active and standby states. (suresh)
+
+HDFS-2231. Configuration changes for HA namenode. (suresh)
+
+HDFS-2418. Change ConfiguredFailoverProxyProvider to take advantage of HDFS-2231. (atm)
+
+HDFS-2393. Mark appropriate methods of ClientProtocol with the idempotent annotation. (atm)
+
+HDFS-2523. Small NN fixes to include HAServiceProtocol and prevent NPE on shutdown. (todd)
+
+HDFS-2577. NN fails to start since it tries to start secret manager in safemode. (todd)
+
+HDFS-2582. Scope dfs.ha.namenodes config by nameservice (todd)
+
+HDFS-2591. MiniDFSCluster support to mix and match federation with HA (todd)
+
+HDFS-1975. Support for sharing the namenode state from active to standby. (jitendra, atm, todd)
+
+HDFS-1971. Send block report from datanode to both active and standby namenodes. (sanjay, todd via suresh)
+
+HDFS-2616. Change DatanodeProtocol#sendHeartbeat() to return HeartbeatResponse. (suresh)
+
+HDFS-2622. Fix TestDFSUpgrade in HA branch. (todd)
+
+HDFS-2612. Handle refreshNameNodes in federated HA clusters (todd)
+
+HDFS-2623. Add test case for hot standby capability (todd)
+
+HDFS-2626. BPOfferService.verifyAndSetNamespaceInfo needs to be synchronized (todd)
+
+HDFS-2624. ConfiguredFailoverProxyProvider doesn't correctly stop ProtocolTranslators (todd)
+
+HDFS-2625. TestDfsOverAvroRpc failing after introduction of HeartbeatResponse type (todd)
+
+HDFS-2627. Determine DN's view of which NN is active based on heartbeat responses (todd)
+
+HDFS-2634. Standby needs to ingest latest edit logs before transitioning to active (todd)
+
+HDFS-2671. NN should throw StandbyException in response to RPCs in STANDBY state (todd)
+
+HDFS-2680. DFSClient should construct failover proxy with exponential backoff (todd)
+
+HDFS-2683. Authority-based lookup of proxy provider fails if path becomes canonicalized (todd)
+
+HDFS-2689. HA: BookKeeperEditLogInputStream doesn't implement isInProgress() (atm)
+
+HDFS-2602. NN should log newly-allocated blocks without losing BlockInfo (atm)
+
+HDFS-2667. Fix transition from active to standby (todd)
+
+HDFS-2684. Fix up some failing unit tests on HA branch (todd)
+
+HDFS-2679. Add interface to query current state to HAServiceProtocol (eli via todd)
+
+HDFS-2677. Web UI should indicate the NN state. (eli via todd)
+
+HDFS-2678. When a FailoverProxyProvider is used, DFSClient should not retry connection ten times before failing over (atm via todd)
+
+HDFS-2682. When a FailoverProxyProvider is used, Client should not retry for 45 times if it is timing out to connect to server. (Uma Maheswara Rao G via todd)
+
+HDFS-2693. Fix synchronization issues around state transition (todd)
+
+HDFS-1972. Fencing mechanism for block invalidations and replications (todd)
+
+HDFS-2714. Fix test cases which use standalone FSNamesystems (todd)
+
+HDFS-2692. Fix bugs related to failover from/into safe mode. (todd)
+
+HDFS-2716. Configuration needs to allow different dfs.http.addresses for each HA NN (todd)
+
+HDFS-2720. Fix MiniDFSCluster HA support to work properly on Windows. (Uma Maheswara Rao G via todd)
+
+HDFS-2291. Allow the StandbyNode to make checkpoints in an HA setup. (todd)
+
+HDFS-2709. Appropriately handle error conditions in EditLogTailer (atm via todd)
+
+HDFS-2730. Refactor shared HA-related test code into HATestUtil class (todd)
+
+HDFS-2762. Fix TestCheckpoint timing out on HA branch. (Uma Maheswara Rao G via todd)
+
+HDFS-2724. NN web UI can throw NPE after startup, before standby state is entered. (todd)
+
+HDFS-2753. Fix standby getting stuck in safemode when blocks are written while SBN is down. (Hari Mankude and todd via todd)
+
+HDFS-2773. Reading edit logs from an earlier version should not leave blocks in under-construction state. (todd)
+
+HDFS-2775. Fix TestStandbyCheckpoints.testBothNodesInStandbyState failing intermittently. (todd)
+
+HDFS-2766. Test for case where standby partially reads log and then performs checkpoint. (atm)
+
+HDFS-2738. FSEditLog.selectinputStreams is reading through in-progress streams even when non-in-progress are requested. (atm)
+
+HDFS-2789. TestHAAdmin.testFailover is failing (eli)
+
+HDFS-2747. Entering safe mode after starting SBN can NPE. (Uma Maheswara Rao G via todd)
+
+HDFS-2772. On transition to active, standby should not swallow ELIE. (atm)
+
+HDFS-2767. ConfiguredFailoverProxyProvider should support NameNodeProtocol. (Uma Maheswara Rao G via todd)
+
+HDFS-2795. Standby NN takes a long time to recover from a dead DN starting up. (todd)
+
+HDFS-2592. Balancer support for HA namenodes. (Uma Maheswara Rao G via todd)
+
+HDFS-2367. Enable the configuration of multiple HA cluster addresses. (atm)
+
+HDFS-2812. When becoming active, the NN should treat all leases as freshly renewed. (todd)
+
+HDFS-2737. Automatically trigger log rolls periodically on the active NN. (todd and atm)
+
+HDFS-2820. Add a simple sanity check for HA config (todd)
+
+HDFS-2688. Add tests for quota tracking in an HA cluster. (todd)
+
+HDFS-2804. Should not mark blocks under-replicated when exiting safemode (todd)
+
+HDFS-2807. Service level authorization for HAServiceProtocol. (jitendra)
+
+HDFS-2809. Add test to verify that delegation tokens are honored after failover. (jitendra and atm)
+
+HDFS-2838. NPE in FSNamesystem when in safe mode. (Gregory Chanan via eli)
+
+HDFS-2805. Add a test for a federated cluster with HA NNs. (Brandon Li via jitendra)
+
+HDFS-2841. HAAdmin does not work if security is enabled. (atm)
+
+HDFS-2691. Fixes for pipeline recovery in an HA cluster: report RBW replicas immediately upon pipeline creation. (todd)
+
+HDFS-2824. Fix failover when prior NN died just after creating an edit log segment. (atm via todd)
+
+HDFS-2853. HA: NN fails to start if the shared edits dir is marked required (atm via eli)
+
+HDFS-2845. SBN should not allow browsing of the file system via web UI. (Bikas Saha via atm)
+
+HDFS-2742. HA: observed dataloss in replication stress test. (todd via eli)
+
+HDFS-2870. Fix log level for block debug info in processMisReplicatedBlocks (todd)
+
+HDFS-2859. LOCAL_ADDRESS_MATCHER.match has NPE when called from DFSUtil.getSuffixIDs when the host is incorrect (Bikas Saha via todd)
+
+HDFS-2861. checkpointing should verify that the dfs.http.address has been configured to a non-loopback for peer NN (todd)
+
+HDFS-2860. TestDFSRollback#testRollback is failing. (atm)
+
+HDFS-2769. HA: When HA is enabled with a shared edits dir, that dir should be
+marked required. (atm via eli)
+
+HDFS-2863. Failures observed if dfs.edits.dir and shared.edits.dir have same directories. (Bikas Saha via atm)
+
+HDFS-2874. Edit log should log to shared dirs before local dirs. (todd)
+
+HDFS-2890. DFSUtil#getSuffixIDs should skip unset configurations. (atm)
+
+HDFS-2792. Make fsck work. (atm)
+
+HDFS-2808. HA: haadmin should use namenode ids. (eli)
+
+HDFS-2819. Document new HA-related configs in hdfs-default.xml. (eli)
+
+HDFS-2752. HA: exit if multiple shared dirs are configured. (eli)
+
+HDFS-2894. HA: automatically determine the nameservice Id if only one nameservice is configured. (eli)
+
+HDFS-2733. Document HA configuration and CLI. (atm)
+
+HDFS-2794. Active NN may purge edit log files before standby NN has a chance to read them (todd)
+
+HDFS-2901. Improvements for SBN web UI - do not show under-replicated/missing blocks. (Brandon Li via jitendra)
+
+HDFS-2905. HA: Standby NN NPE when shared edits dir is deleted. (Bikas Saha via jitendra)
+
+HDFS-2579. Starting delegation token manager during safemode fails. (todd)
+
+HDFS-2510. Add HA-related metrics. (atm)
+
+HDFS-2924. Standby checkpointing fails to authenticate in secure cluster. (todd)
+
+HDFS-2915. HA: TestFailureOfSharedDir.testFailureOfSharedDir() has race condition. (Bikas Saha via jitendra)
+
+HDFS-2912. Namenode not shutting down when shared edits dir is inaccessible. (Bikas Saha via atm)
+
+HDFS-2917. HA: haadmin should not work if run by regular user (eli)
+
+HDFS-2939. TestHAStateTransitions fails on Windows. (Uma Maheswara Rao G via atm)
+
+HDFS-2947. On startup NN throws an NPE in the metrics system. (atm)
+
+HDFS-2942. TestActiveStandbyElectorRealZK fails if build dir does not exist. (atm)
+
+HDFS-2948. NN throws NPE during shutdown if it fails to startup (todd)
+
+HDFS-2909. HA: Inaccessible shared edits dir not getting removed from FSImage storage dirs upon error. (Bikas Saha via jitendra)
+
+HDFS-2934. Allow configs to be scoped to all NNs in the nameservice. (todd)
+
+HDFS-2935. Shared edits dir property should be suffixed with nameservice and namenodeID (todd)
+
+HDFS-2928. ConfiguredFailoverProxyProvider should not create a NameNode proxy with an underlying retry proxy. (Uma Maheswara Rao G via atm)
+
+HDFS-2955. IllegalStateException during standby startup in getCurSegmentTxId. (Hari Mankude via atm)
+
+HDFS-2937. TestDFSHAAdmin needs tests with MiniDFSCluster. (Brandon Li via suresh)
+
+HDFS-2586. Add protobuf service and implementation for HAServiceProtocol. (suresh via atm)
+
+HDFS-2952. NN should not start with upgrade option or with a pending or unfinalized upgrade. (atm)
+
+HDFS-2974. MiniDFSCluster does not delete standby NN name dirs during format. (atm)
+
+HDFS-2929. Stress test and fixes for block synchronization (todd)
+
+HDFS-2972. Small optimization building incremental block report (todd)
+
+HDFS-2973. Re-enable NO_ACK optimization for block deletion. (todd)
+
+HDFS-2922. HA: close out operation categories (eli)
+
+HDFS-2993. HA: BackupNode#checkOperation should permit CHECKPOINT operations (eli)
+
+HDFS-2904. Client support for getting delegation tokens. (todd)
+
+HDFS-3013. HA: NameNode format doesn't pick up dfs.namenode.name.dir.NameServiceId configuration (Mingjie Lai via todd)
+
+HDFS-3019. Fix silent failure of TestEditLogJournalFailures (todd)
+
+HDFS-2958. Sweep for remaining proxy construction which doesn't go through failover path. (atm)
+
+HDFS-2920. fix remaining TODO items. (atm and todd)
+
+HDFS-3027. Implement a simple NN health check. (atm)
+
+HDFS-3023. Optimize entries in edits log for persistBlocks call. (todd)
+
+HDFS-2979. Balancer should use logical uri for creating failover proxy with HA enabled. (atm)
+
+HDFS-3035. Fix failure of TestFileAppendRestart due to OP_UPDATE_BLOCKS (todd)
+
+HDFS-3039. Address findbugs and javadoc warnings on branch. (todd via atm)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml b/hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml
index 709e52f..301d302 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/dev-support/findbugsExcludeFile.xml
@@ -256,4 +256,12 @@
<Field name="metrics" />
<Bug pattern="IS2_INCONSISTENT_SYNC" />
</Match>
+ <!--
+ This method isn't performance-critical and is much clearer to write as it's written.
+ -->
+ <Match>
+ <Class name="org.apache.hadoop.hdfs.server.datanode.BlockPoolManager" />
+ <Method name="doRefreshNamenodes" />
+ <Bug category="PERFORMANCE" />
+ </Match>
</FindBugsFilter>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/pom.xml b/hadoop-hdfs-project/hadoop-hdfs/pom.xml
index 0b4da80..3f85de0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/pom.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/pom.xml
@@ -387,6 +387,7 @@
<configuration>
<excludes>
<exclude>CHANGES.txt</exclude>
+ <exclude>CHANGES.HDFS-1623.txt</exclude>
<exclude>.idea/**</exclude>
<exclude>src/main/conf/*</exclude>
<exclude>src/main/docs/**</exclude>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogInputStream.java
index 707182e..636471a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogInputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogInputStream.java
@@ -129,6 +129,12 @@
return null;
}
+ // TODO(HA): Test this.
+ @Override
+ public boolean isInProgress() {
+ return true;
+ }
+
/**
* Input stream implementation which can be used by
* FSEditLogOp.Reader
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperJournalManager.java
index 7fa9026..047efd5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperJournalManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperJournalManager.java
@@ -312,8 +312,10 @@
}
}
+ // TODO(HA): Handle inProgressOk
@Override
- public EditLogInputStream getInputStream(long fromTxnId) throws IOException {
+ public EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk)
+ throws IOException {
for (EditLogLedgerMetadata l : getLedgerList()) {
if (l.getFirstTxId() == fromTxnId) {
try {
@@ -329,8 +331,10 @@
throw new IOException("No ledger for fromTxnId " + fromTxnId + " found.");
}
+ // TODO(HA): Handle inProgressOk
@Override
- public long getNumberOfTransactions(long fromTxnId) throws IOException {
+ public long getNumberOfTransactions(long fromTxnId, boolean inProgressOk)
+ throws IOException {
long count = 0;
long expectedStart = 0;
for (EditLogLedgerMetadata l : getLedgerList()) {
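The journal manager interface now takes an inProgressOk flag so callers can choose whether a segment that is still being written may be read; this BookKeeper implementation ignores the flag for now (see the TODO(HA) markers above). A minimal caller-side sketch, assuming an already constructed BookKeeperJournalManager named bkjm (as in the tests further below):

    // Hedged sketch only; "bkjm" is assumed to be a configured
    // BookKeeperJournalManager instance.
    long txCount = bkjm.getNumberOfTransactions(1, true);   // allow in-progress segments
    EditLogInputStream in = bkjm.getInputStream(1, true);    // same flag on the read path
    try {
      // ... consume edit log ops ...
    } finally {
      in.close();
    }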
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/test/java/org/apache/hadoop/contrib/bkjournal/TestBookKeeperJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/test/java/org/apache/hadoop/contrib/bkjournal/TestBookKeeperJournalManager.java
index b949bc2..5937fa8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/test/java/org/apache/hadoop/contrib/bkjournal/TestBookKeeperJournalManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/test/java/org/apache/hadoop/contrib/bkjournal/TestBookKeeperJournalManager.java
@@ -195,7 +195,7 @@
out.close();
bkjm.finalizeLogSegment(1, 100);
- long numTrans = bkjm.getNumberOfTransactions(1);
+ long numTrans = bkjm.getNumberOfTransactions(1, true);
assertEquals(100, numTrans);
}
@@ -218,17 +218,17 @@
}
zkc.delete(bkjm.finalizedLedgerZNode(DEFAULT_SEGMENT_SIZE+1, DEFAULT_SEGMENT_SIZE*2), -1);
- long numTrans = bkjm.getNumberOfTransactions(1);
+ long numTrans = bkjm.getNumberOfTransactions(1, true);
assertEquals(DEFAULT_SEGMENT_SIZE, numTrans);
try {
- numTrans = bkjm.getNumberOfTransactions(DEFAULT_SEGMENT_SIZE+1);
+ numTrans = bkjm.getNumberOfTransactions(DEFAULT_SEGMENT_SIZE+1, true);
fail("Should have thrown corruption exception by this point");
} catch (JournalManager.CorruptionException ce) {
// if we get here, everything is going good
}
- numTrans = bkjm.getNumberOfTransactions((DEFAULT_SEGMENT_SIZE*2)+1);
+ numTrans = bkjm.getNumberOfTransactions((DEFAULT_SEGMENT_SIZE*2)+1, true);
assertEquals(DEFAULT_SEGMENT_SIZE, numTrans);
}
@@ -262,7 +262,7 @@
out.abort();
out.close();
- long numTrans = bkjm.getNumberOfTransactions(1);
+ long numTrans = bkjm.getNumberOfTransactions(1, true);
assertEquals((txid-1), numTrans);
}
@@ -357,7 +357,7 @@
bkjm.finalizeLogSegment(1, numTransactions);
- EditLogInputStream in = bkjm.getInputStream(1);
+ EditLogInputStream in = bkjm.getInputStream(1, true);
try {
assertEquals(numTransactions,
FSEditLogTestUtil.countTransactionsInStream(in));
@@ -392,4 +392,4 @@
assertNotNull(zkc.exists(bkjm.finalizedLedgerZNode(1, 100), false));
assertNull(zkc.exists(bkjm.inprogressZNode(), false));
}
-}
\ No newline at end of file
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs b/hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs
index 4c56bb3f..a01c939 100755
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/bin/hdfs
@@ -32,6 +32,7 @@
echo " namenode run the DFS namenode"
echo " datanode run a DFS datanode"
echo " dfsadmin run a DFS admin client"
+ echo " haadmin run a DFS HA admin client"
echo " fsck run a DFS filesystem checking utility"
echo " balancer run a cluster balancing utility"
echo " jmxget get JMX exported values from NameNode or DataNode."
@@ -86,6 +87,10 @@
elif [ "$COMMAND" = "dfsadmin" ] ; then
CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "haadmin" ] ; then
+ CLASS=org.apache.hadoop.hdfs.tools.DFSHAAdmin
+ CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+ HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "fsck" ] ; then
CLASS=org.apache.hadoop.hdfs.tools.DFSck
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
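The new subcommand simply dispatches to DFSHAAdmin, which follows the usual Tool pattern and can therefore also be driven programmatically. A hedged sketch; the NameNode ID "nn1" is an assumed example, and the call is roughly equivalent to running "hdfs haadmin -getServiceState nn1" from the shell:

    import org.apache.hadoop.hdfs.HdfsConfiguration;
    import org.apache.hadoop.hdfs.tools.DFSHAAdmin;
    import org.apache.hadoop.util.ToolRunner;

    public class HAAdminExample {
      public static void main(String[] args) throws Exception {
        // "nn1" is an illustrative NameNode ID from an assumed HA configuration.
        int rc = ToolRunner.run(new HdfsConfiguration(), new DFSHAAdmin(),
            new String[] { "-getServiceState", "nn1" });
        System.exit(rc);
      }
    }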
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/fs/Hdfs.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/fs/Hdfs.java
index 5a45f51..82d0c36 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/fs/Hdfs.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/fs/Hdfs.java
@@ -80,8 +80,7 @@
throw new IOException("Incomplete HDFS URI, no host: " + theUri);
}
- InetSocketAddress namenode = NameNode.getAddress(theUri.getAuthority());
- this.dfs = new DFSClient(namenode, conf, getStatistics());
+ this.dfs = new DFSClient(theUri, conf, getStatistics());
}
@Override
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
index 359fd47..88b36b7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
@@ -1,4 +1,3 @@
-
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
@@ -30,6 +29,8 @@
import java.net.NetworkInterface;
import java.net.Socket;
import java.net.SocketException;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
@@ -60,6 +61,7 @@
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.fs.permission.FsPermission;
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
+
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.CorruptFileBlocks;
import org.apache.hadoop.hdfs.protocol.DSQuotaExceededException;
@@ -83,6 +85,7 @@
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpBlockChecksumResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
+import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport;
@@ -105,7 +108,8 @@
import org.apache.hadoop.security.token.TokenRenewer;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
+
+import com.google.common.base.Preconditions;
/********************************************************
* DFSClient can connect to a Hadoop Filesystem and
@@ -124,7 +128,9 @@
public static final long SERVER_DEFAULTS_VALIDITY_PERIOD = 60 * 60 * 1000L; // 1 hour
static final int TCP_WINDOW_SIZE = 128 * 1024; // 128 KB
final ClientProtocol namenode;
- private final InetSocketAddress nnAddress;
+ /* The service used for delegation tokens */
+ private Text dtService;
+
final UserGroupInformation ugi;
volatile boolean clientRunning = true;
private volatile FsServerDefaults serverDefaults;
@@ -143,6 +149,9 @@
* DFSClient configuration
*/
static class Conf {
+ final int maxFailoverAttempts;
+ final int failoverSleepBaseMillis;
+ final int failoverSleepMaxMillis;
final int maxBlockAcquireFailures;
final int confTime;
final int ioBufferSize;
@@ -164,6 +173,16 @@
final boolean useLegacyBlockReader;
Conf(Configuration conf) {
+ maxFailoverAttempts = conf.getInt(
+ DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY,
+ DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_DEFAULT);
+ failoverSleepBaseMillis = conf.getInt(
+ DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_KEY,
+ DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_DEFAULT);
+ failoverSleepMaxMillis = conf.getInt(
+ DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_KEY,
+ DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_DEFAULT);
+
maxBlockAcquireFailures = conf.getInt(
DFS_CLIENT_MAX_BLOCK_ACQUIRE_FAILURES_KEY,
DFS_CLIENT_MAX_BLOCK_ACQUIRE_FAILURES_DEFAULT);
@@ -236,6 +255,7 @@
*/
private final Map<String, DFSOutputStream> filesBeingWritten
= new HashMap<String, DFSOutputStream>();
+
private boolean shortCircuitLocalReads;
/**
@@ -247,59 +267,69 @@
public DFSClient(Configuration conf) throws IOException {
this(NameNode.getAddress(conf), conf);
}
+
+ public DFSClient(InetSocketAddress address, Configuration conf) throws IOException {
+ this(NameNode.getUri(address), conf);
+ }
/**
- * Same as this(nameNodeAddr, conf, null);
+ * Same as this(nameNodeUri, conf, null);
* @see #DFSClient(InetSocketAddress, Configuration, org.apache.hadoop.fs.FileSystem.Statistics)
*/
- public DFSClient(InetSocketAddress nameNodeAddr, Configuration conf
+ public DFSClient(URI nameNodeUri, Configuration conf
) throws IOException {
- this(nameNodeAddr, conf, null);
+ this(nameNodeUri, conf, null);
}
/**
- * Same as this(nameNodeAddr, null, conf, stats);
+ * Same as this(nameNodeUri, null, conf, stats);
* @see #DFSClient(InetSocketAddress, ClientProtocol, Configuration, org.apache.hadoop.fs.FileSystem.Statistics)
*/
- public DFSClient(InetSocketAddress nameNodeAddr, Configuration conf,
+ public DFSClient(URI nameNodeUri, Configuration conf,
FileSystem.Statistics stats)
throws IOException {
- this(nameNodeAddr, null, conf, stats);
+ this(nameNodeUri, null, conf, stats);
}
-
+
/**
- * Create a new DFSClient connected to the given nameNodeAddr or rpcNamenode.
- * Exactly one of nameNodeAddr or rpcNamenode must be null.
+ * Create a new DFSClient connected to the given nameNodeUri or rpcNamenode.
+ * Exactly one of nameNodeUri or rpcNamenode must be null.
*/
- DFSClient(InetSocketAddress nameNodeAddr, ClientProtocol rpcNamenode,
+ DFSClient(URI nameNodeUri, ClientProtocol rpcNamenode,
Configuration conf, FileSystem.Statistics stats)
throws IOException {
// Copy only the required DFSClient configuration
this.dfsClientConf = new Conf(conf);
this.conf = conf;
this.stats = stats;
- this.nnAddress = nameNodeAddr;
this.socketFactory = NetUtils.getSocketFactory(conf, ClientProtocol.class);
this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
// The hdfsTimeout is currently the same as the ipc timeout
this.hdfsTimeout = Client.getTimeout(conf);
this.ugi = UserGroupInformation.getCurrentUser();
- final String authority = nameNodeAddr == null? "null":
- nameNodeAddr.getHostName() + ":" + nameNodeAddr.getPort();
+
+ final String authority = nameNodeUri == null? "null": nameNodeUri.getAuthority();
this.leaserenewer = LeaseRenewer.getInstance(authority, ugi, this);
this.clientName = leaserenewer.getClientName(dfsClientConf.taskId);
+
this.socketCache = new SocketCache(dfsClientConf.socketCacheCapacity);
- if (nameNodeAddr != null && rpcNamenode == null) {
- this.namenode = DFSUtil.createNamenode(nameNodeAddr, conf, ugi);
- } else if (nameNodeAddr == null && rpcNamenode != null) {
- //This case is used for testing.
+
+
+ if (rpcNamenode != null) {
+ // This case is used for testing.
+ Preconditions.checkArgument(nameNodeUri == null);
this.namenode = rpcNamenode;
+ dtService = null;
} else {
- throw new IllegalArgumentException(
- "Expecting exactly one of nameNodeAddr and rpcNamenode being null: "
- + "nameNodeAddr=" + nameNodeAddr + ", rpcNamenode=" + rpcNamenode);
+ Preconditions.checkArgument(nameNodeUri != null,
+ "null URI");
+ NameNodeProxies.ProxyAndInfo<ClientProtocol> proxyInfo =
+ NameNodeProxies.createProxy(conf, nameNodeUri, ClientProtocol.class);
+ this.dtService = proxyInfo.getDelegationTokenService();
+ this.namenode = proxyInfo.getProxy();
}
+
// read directly from the block file if configured.
this.shortCircuitLocalReads = conf.getBoolean(
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY,
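With the constructors reworked around URIs, a DFSClient can be pointed either at a single NameNode or at a logical HA nameservice; the old InetSocketAddress constructor is kept as a thin wrapper. A minimal sketch, assuming the nameservice name "mycluster" purely for illustration:

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSClient;

    public class DFSClientUriExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // "hdfs://mycluster" resolves through the configured failover proxy
        // provider when HA is set up; otherwise it behaves like a plain NN URI.
        DFSClient client = new DFSClient(URI.create("hdfs://mycluster"), conf);
        client.close();
      }
    }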
@@ -388,20 +418,8 @@
/**
* Close connections the Namenode.
- * The namenode variable is either a rpcProxy passed by a test or
- * created using the protocolTranslator which is closeable.
- * If closeable then call close, else close using RPC.stopProxy().
*/
void closeConnectionToNamenode() {
- if (namenode instanceof Closeable) {
- try {
- ((Closeable) namenode).close();
- return;
- } catch (IOException e) {
- // fall through - lets try the stopProxy
- LOG.warn("Exception closing namenode, stopping the proxy");
- }
- }
RPC.stopProxy(namenode);
}
@@ -491,11 +509,13 @@
*/
public Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
throws IOException {
- Token<DelegationTokenIdentifier> result =
+ assert dtService != null;
+ Token<DelegationTokenIdentifier> token =
namenode.getDelegationToken(renewer);
- SecurityUtil.setTokenService(result, nnAddress);
- LOG.info("Created " + DelegationTokenIdentifier.stringifyToken(result));
- return result;
+ token.setService(this.dtService);
+
+ LOG.info("Created " + DelegationTokenIdentifier.stringifyToken(token));
+ return token;
}
/**
@@ -625,13 +645,8 @@
@Override
public long renew(Token<?> token, Configuration conf) throws IOException {
Token<DelegationTokenIdentifier> delToken =
- (Token<DelegationTokenIdentifier>) token;
- LOG.info("Renewing " +
- DelegationTokenIdentifier.stringifyToken(delToken));
- ClientProtocol nn =
- DFSUtil.createNamenode
- (SecurityUtil.getTokenServiceAddr(delToken),
- conf, UserGroupInformation.getCurrentUser());
+ (Token<DelegationTokenIdentifier>) token;
+ ClientProtocol nn = getNNProxy(delToken, conf);
try {
return nn.renewDelegationToken(delToken);
} catch (RemoteException re) {
@@ -647,9 +662,7 @@
(Token<DelegationTokenIdentifier>) token;
LOG.info("Cancelling " +
DelegationTokenIdentifier.stringifyToken(delToken));
- ClientProtocol nn = DFSUtil.createNamenode(
- SecurityUtil.getTokenServiceAddr(delToken), conf,
- UserGroupInformation.getCurrentUser());
+ ClientProtocol nn = getNNProxy(delToken, conf);
try {
nn.cancelDelegationToken(delToken);
} catch (RemoteException re) {
@@ -657,6 +670,31 @@
AccessControlException.class);
}
}
+
+ private static ClientProtocol getNNProxy(
+ Token<DelegationTokenIdentifier> token, Configuration conf)
+ throws IOException {
+ URI uri = HAUtil.getServiceUriFromToken(token);
+ if (HAUtil.isTokenForLogicalUri(token) &&
+ !HAUtil.isLogicalUri(conf, uri)) {
+ // If the token is for a logical nameservice, but the configuration
+ // we have disagrees about that, we can't actually renew it.
+ // This can be the case in MR, for example, if the RM doesn't
+ // have all of the HA clusters configured in its configuration.
+ throw new IOException("Unable to map logical nameservice URI '" +
+ uri + "' to a NameNode. Local configuration does not have " +
+ "a failover proxy provider configured.");
+ }
+
+ NameNodeProxies.ProxyAndInfo<ClientProtocol> info =
+ NameNodeProxies.createProxy(conf, uri, ClientProtocol.class);
+ assert info.getDelegationTokenService().equals(token.getService()) :
+ "Returned service '" + info.getDelegationTokenService().toString() +
+ "' doesn't match expected service '" +
+ token.getService().toString() + "'";
+
+ return info.getProxy();
+ }
@Override
public boolean isManaged(Token<?> token) throws IOException {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index fef6d8b9..4187f1c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -48,6 +48,19 @@
public static final String DFS_CLIENT_WRITE_REPLACE_DATANODE_ON_FAILURE_POLICY_DEFAULT = "DEFAULT";
public static final String DFS_CLIENT_SOCKET_CACHE_CAPACITY_KEY = "dfs.client.socketcache.capacity";
public static final int DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT = 16;
+
+ // HA related configuration
+ public static final String DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX = "dfs.client.failover.proxy.provider";
+ public static final String DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY = "dfs.client.failover.max.attempts";
+ public static final int DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_DEFAULT = 15;
+ public static final String DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_KEY = "dfs.client.failover.sleep.base.millis";
+ public static final int DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_DEFAULT = 500;
+ public static final String DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_KEY = "dfs.client.failover.sleep.max.millis";
+ public static final int DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_DEFAULT = 15000;
+ public static final String DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_KEY = "dfs.client.failover.connection.retries";
+ public static final int DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_DEFAULT = 0;
+ public static final String DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_ON_SOCKET_TIMEOUTS_KEY = "dfs.client.failover.connection.retries.on.timeouts";
+ public static final int DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_ON_SOCKET_TIMEOUTS_DEFAULT = 0;
public static final String DFS_NAMENODE_BACKUP_ADDRESS_KEY = "dfs.namenode.backup.address";
public static final String DFS_NAMENODE_BACKUP_ADDRESS_DEFAULT = "localhost:50100";
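The new dfs.client.failover.* keys control how aggressively a client retries across NameNodes. A hedged example of tuning them in code; the values are illustrative, not recommendations:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;

    public class FailoverTuningExample {
      public static Configuration tunedConf() {
        Configuration conf = new Configuration();
        // Retry up to 20 failovers, backing off from 1s up to a 20s cap.
        conf.setInt(DFSConfigKeys.DFS_CLIENT_FAILOVER_MAX_ATTEMPTS_KEY, 20);
        conf.setInt(DFSConfigKeys.DFS_CLIENT_FAILOVER_SLEEPTIME_BASE_KEY, 1000);
        conf.setInt(DFSConfigKeys.DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_KEY, 20000);
        return conf;
      }
    }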
@@ -120,6 +133,8 @@
public static final boolean DFS_WEBHDFS_ENABLED_DEFAULT = false;
public static final String DFS_PERMISSIONS_ENABLED_KEY = "dfs.permissions.enabled";
public static final boolean DFS_PERMISSIONS_ENABLED_DEFAULT = true;
+ public static final String DFS_PERSIST_BLOCKS_KEY = "dfs.persist.blocks";
+ public static final boolean DFS_PERSIST_BLOCKS_DEFAULT = false;
public static final String DFS_PERMISSIONS_SUPERUSERGROUP_KEY = "dfs.permissions.superusergroup";
public static final String DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT = "supergroup";
public static final String DFS_ADMIN = "dfs.cluster.administrators";
@@ -131,6 +146,9 @@
public static final boolean DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_DEFAULT = true;
public static final String DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY = "dfs.namenode.num.checkpoints.retained";
public static final int DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT = 2;
+ public static final String DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY = "dfs.namenode.num.extra.edits.retained";
+ public static final int DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_DEFAULT = 1000000; //1M
+
public static final String DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY = "dfs.namenode.edits.dir.minimum";
public static final int DFS_NAMENODE_EDITS_DIR_MINIMUM_DEFAULT = 1;
@@ -150,6 +168,8 @@
public static final long DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT = 24*60*60*1000; // 1 day
public static final String DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY = "dfs.namenode.delegation.token.max-lifetime";
public static final long DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT = 7*24*60*60*1000; // 7 days
+ public static final String DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY = "dfs.namenode.delegation.token.always-use"; // for tests
+ public static final boolean DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT = false;
//Filesystem limit keys
public static final String DFS_NAMENODE_MAX_COMPONENT_LENGTH_KEY = "dfs.namenode.fs-limits.max-component-length";
@@ -165,6 +185,7 @@
public static final String DFS_NAMENODE_HTTPS_ADDRESS_DEFAULT = "0.0.0.0:" + DFS_NAMENODE_HTTPS_PORT_DEFAULT;
public static final String DFS_NAMENODE_NAME_DIR_KEY = "dfs.namenode.name.dir";
public static final String DFS_NAMENODE_EDITS_DIR_KEY = "dfs.namenode.edits.dir";
+ public static final String DFS_NAMENODE_SHARED_EDITS_DIR_KEY = "dfs.namenode.shared.edits.dir";
public static final String DFS_NAMENODE_EDITS_PLUGIN_PREFIX = "dfs.namenode.edits.journal-plugin";
public static final String DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY = "dfs.namenode.edits.dir.required";
public static final String DFS_CLIENT_READ_PREFETCH_SIZE_KEY = "dfs.client.read.prefetch.size";
@@ -298,8 +319,8 @@
public static final String DFS_NAMENODE_NAME_CACHE_THRESHOLD_KEY = "dfs.namenode.name.cache.threshold";
public static final int DFS_NAMENODE_NAME_CACHE_THRESHOLD_DEFAULT = 10;
- public static final String DFS_FEDERATION_NAMESERVICES = "dfs.federation.nameservices";
- public static final String DFS_FEDERATION_NAMESERVICE_ID = "dfs.federation.nameservice.id";
+ public static final String DFS_FEDERATION_NAMESERVICES = "dfs.federation.nameservices";
+ public static final String DFS_FEDERATION_NAMESERVICE_ID = "dfs.federation.nameservice.id";
public static final String DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_KEY = "dfs.namenode.resource.check.interval";
public static final int DFS_NAMENODE_RESOURCE_CHECK_INTERVAL_DEFAULT = 5000;
public static final String DFS_NAMENODE_DU_RESERVED_KEY = "dfs.namenode.resource.du.reserved";
@@ -309,5 +330,16 @@
public static final int DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_DEFAULT = 1;
public static final String DFS_WEB_AUTHENTICATION_KERBEROS_PRINCIPAL_KEY = "dfs.web.authentication.kerberos.principal";
public static final String DFS_WEB_AUTHENTICATION_KERBEROS_KEYTAB_KEY = "dfs.web.authentication.kerberos.keytab";
+
public static final String DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY = "dfs.block.local-path-access.user";
+
+ // HA related configuration
+ public static final String DFS_HA_NAMENODES_KEY_PREFIX = "dfs.ha.namenodes";
+ public static final String DFS_HA_NAMENODE_ID_KEY = "dfs.ha.namenode.id";
+ public static final String DFS_HA_STANDBY_CHECKPOINTS_KEY = "dfs.ha.standby.checkpoints";
+ public static final boolean DFS_HA_STANDBY_CHECKPOINTS_DEFAULT = true;
+ public static final String DFS_HA_LOGROLL_PERIOD_KEY = "dfs.ha.log-roll.period";
+ public static final int DFS_HA_LOGROLL_PERIOD_DEFAULT = 2 * 60; // 2m
+ public static final String DFS_HA_TAILEDITS_PERIOD_KEY = "dfs.ha.tail-edits.period";
+ public static final int DFS_HA_TAILEDITS_PERIOD_DEFAULT = 60; // 1m
}
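Taken together with the key-suffixing logic added to DFSUtil below, these keys are scoped per nameservice and per NameNode ID. A minimal sketch of an HA-style configuration; the nameservice "mycluster" and NameNode IDs "nn1"/"nn2" are assumed examples:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;

    public class HaConfigExample {
      public static Configuration haConf() {
        Configuration conf = new Configuration();
        conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, "mycluster");
        conf.set(DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX + ".mycluster", "nn1,nn2");
        conf.set(DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY + ".mycluster.nn1",
            "nn1.example.com:8020");
        conf.set(DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY + ".mycluster.nn2",
            "nn2.example.com:8020");
        return conf;
      }
    }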
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java
index 7064616..cbc0f0e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSUtil.java
@@ -18,24 +18,21 @@
package org.apache.hadoop.hdfs;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_FEDERATION_NAMESERVICES;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_FEDERATION_NAMESERVICE_ID;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY;
-
+import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetSocketAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.SecureRandom;
-import java.util.ArrayList;
import java.util.Collection;
+import java.util.Collections;
import java.util.Comparator;
+import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Random;
+import java.util.Set;
import java.util.StringTokenizer;
import javax.net.SocketFactory;
@@ -46,9 +43,9 @@
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
-import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocolPB.ClientDatanodeProtocolTranslatorPB;
@@ -59,11 +56,19 @@
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import com.google.protobuf.BlockingService;
@InterfaceAudience.Private
public class DFSUtil {
+ public static final Log LOG = LogFactory.getLog(DFSUtil.class.getName());
+
+ private DFSUtil() { /* Hidden constructor */ }
private static final ThreadLocal<Random> RANDOM = new ThreadLocal<Random>() {
@Override
protected Random initialValue() {
@@ -101,13 +106,20 @@
a.isDecommissioned() ? 1 : -1;
}
};
+ /**
+ * Address matcher for matching an address to local address
+ */
+ static final AddressMatcher LOCAL_ADDRESS_MATCHER = new AddressMatcher() {
+ public boolean match(InetSocketAddress s) {
+ return NetUtils.isLocalAddress(s.getAddress());
+ };
+ };
/**
* Whether the pathname is valid. Currently prohibits relative paths,
* and names which contain a ":" or "/"
*/
public static boolean isValidName(String src) {
-
// Path must be absolute.
if (!src.startsWith(Path.SEPARATOR)) {
return false;
@@ -304,13 +316,39 @@
/**
* Returns collection of nameservice Ids from the configuration.
* @param conf configuration
- * @return collection of nameservice Ids
+ * @return collection of nameservice Ids, or null if not specified
*/
public static Collection<String> getNameServiceIds(Configuration conf) {
- return conf.getStringCollection(DFS_FEDERATION_NAMESERVICES);
+ return conf.getTrimmedStringCollection(DFS_FEDERATION_NAMESERVICES);
}
/**
+ * @return <code>coll</code> if it is non-null and non-empty. Otherwise,
+ * returns a list with a single null value.
+ */
+ private static Collection<String> emptyAsSingletonNull(Collection<String> coll) {
+ if (coll == null || coll.isEmpty()) {
+ return Collections.singletonList(null);
+ } else {
+ return coll;
+ }
+ }
+
+ /**
+ * Namenode HighAvailability related configuration.
+ * Returns collection of namenode Ids from the configuration. One logical id
+ * for each namenode in the HA setup.
+ *
+ * @param conf configuration
+ * @param nsId the nameservice ID to look at, or null for non-federated
+ * @return collection of namenode Ids
+ */
+ public static Collection<String> getNameNodeIds(Configuration conf, String nsId) {
+ String key = addSuffix(DFS_HA_NAMENODES_KEY_PREFIX, nsId);
+ return conf.getTrimmedStringCollection(key);
+ }
+
+ /**
* Given a list of keys in the order of preference, returns a value
* for the key in the given order from the configuration.
* @param defaultValue default value to return, when key was not found
@@ -323,9 +361,7 @@
Configuration conf, String... keys) {
String value = null;
for (String key : keys) {
- if (keySuffix != null) {
- key += "." + keySuffix;
- }
+ key = addSuffix(key, keySuffix);
value = conf.get(key);
if (value != null) {
break;
@@ -337,36 +373,84 @@
return value;
}
+ /** Add non empty and non null suffix to a key */
+ private static String addSuffix(String key, String suffix) {
+ if (suffix == null || suffix.isEmpty()) {
+ return key;
+ }
+ assert !suffix.startsWith(".") :
+ "suffix '" + suffix + "' should not already have '.' prepended.";
+ return key + "." + suffix;
+ }
+
+ /** Concatenate list of suffix strings '.' separated */
+ private static String concatSuffixes(String... suffixes) {
+ if (suffixes == null) {
+ return null;
+ }
+ return Joiner.on(".").skipNulls().join(suffixes);
+ }
+
/**
- * Returns list of InetSocketAddress for a given set of keys.
- * @param conf configuration
- * @param defaultAddress default address to return in case key is not found
- * @param keys Set of keys to look for in the order of preference
- * @return list of InetSocketAddress corresponding to the key
+ * Return configuration key of format key.suffix1.suffix2...suffixN
*/
- private static List<InetSocketAddress> getAddresses(Configuration conf,
+ public static String addKeySuffixes(String key, String... suffixes) {
+ String keySuffix = concatSuffixes(suffixes);
+ return addSuffix(key, keySuffix);
+ }
+
+ /**
+ * Returns the configured address for all NameNodes in the cluster.
+ * @param conf configuration
+ * @param defaultAddress default address to return in case key is not found.
+ * @param keys Set of keys to look for in the order of preference
+ * @return a map(nameserviceId to map(namenodeId to InetSocketAddress))
+ */
+ private static Map<String, Map<String, InetSocketAddress>>
+ getAddresses(Configuration conf,
String defaultAddress, String... keys) {
Collection<String> nameserviceIds = getNameServiceIds(conf);
- List<InetSocketAddress> isas = new ArrayList<InetSocketAddress>();
-
- // Configuration with a single namenode
- if (nameserviceIds == null || nameserviceIds.isEmpty()) {
- String address = getConfValue(defaultAddress, null, conf, keys);
- if (address == null) {
- return null;
- }
- isas.add(NetUtils.createSocketAddr(address));
- } else {
- // Get the namenodes for all the configured nameServiceIds
- for (String nameserviceId : nameserviceIds) {
- String address = getConfValue(null, nameserviceId, conf, keys);
- if (address == null) {
- return null;
- }
- isas.add(NetUtils.createSocketAddr(address));
+
+ // Look for configurations of the form <key>[.<nameserviceId>][.<namenodeId>]
+ // across all of the configured nameservices and namenodes.
+ Map<String, Map<String, InetSocketAddress>> ret = Maps.newHashMap();
+ for (String nsId : emptyAsSingletonNull(nameserviceIds)) {
+ Map<String, InetSocketAddress> isas =
+ getAddressesForNameserviceId(conf, nsId, defaultAddress, keys);
+ if (!isas.isEmpty()) {
+ ret.put(nsId, isas);
}
}
- return isas;
+ return ret;
+ }
+
+ private static Map<String, InetSocketAddress> getAddressesForNameserviceId(
+ Configuration conf, String nsId, String defaultValue,
+ String[] keys) {
+ Collection<String> nnIds = getNameNodeIds(conf, nsId);
+ Map<String, InetSocketAddress> ret = Maps.newHashMap();
+ for (String nnId : emptyAsSingletonNull(nnIds)) {
+ String suffix = concatSuffixes(nsId, nnId);
+ String address = getConfValue(defaultValue, suffix, conf, keys);
+ if (address != null) {
+ InetSocketAddress isa = NetUtils.createSocketAddr(address);
+ ret.put(nnId, isa);
+ }
+ }
+ return ret;
+ }
+
+ /**
+ * Returns list of InetSocketAddress corresponding to HA NN RPC addresses from
+ * the configuration.
+ *
+ * @param conf configuration
+ * @return list of InetSocketAddresses
+ * @throws IOException if no addresses are configured
+ */
+ public static Map<String, Map<String, InetSocketAddress>> getHaNnRpcAddresses(
+ Configuration conf) {
+ return getAddresses(conf, null, DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY);
}
/**
@@ -377,11 +461,11 @@
* @return list of InetSocketAddresses
* @throws IOException on error
*/
- public static List<InetSocketAddress> getBackupNodeAddresses(
+ public static Map<String, Map<String, InetSocketAddress>> getBackupNodeAddresses(
Configuration conf) throws IOException {
- List<InetSocketAddress> addressList = getAddresses(conf,
+ Map<String, Map<String, InetSocketAddress>> addressList = getAddresses(conf,
null, DFS_NAMENODE_BACKUP_ADDRESS_KEY);
- if (addressList == null) {
+ if (addressList.isEmpty()) {
throw new IOException("Incorrect configuration: backup node address "
+ DFS_NAMENODE_BACKUP_ADDRESS_KEY + " is not configured.");
}
@@ -396,11 +480,11 @@
* @return list of InetSocketAddresses
* @throws IOException on error
*/
- public static List<InetSocketAddress> getSecondaryNameNodeAddresses(
+ public static Map<String, Map<String, InetSocketAddress>> getSecondaryNameNodeAddresses(
Configuration conf) throws IOException {
- List<InetSocketAddress> addressList = getAddresses(conf, null,
+ Map<String, Map<String, InetSocketAddress>> addressList = getAddresses(conf, null,
DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY);
- if (addressList == null) {
+ if (addressList.isEmpty()) {
throw new IOException("Incorrect configuration: secondary namenode address "
+ DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY + " is not configured.");
}
@@ -420,7 +504,7 @@
* @return list of InetSocketAddress
* @throws IOException on error
*/
- public static List<InetSocketAddress> getNNServiceRpcAddresses(
+ public static Map<String, Map<String, InetSocketAddress>> getNNServiceRpcAddresses(
Configuration conf) throws IOException {
// Use default address as fall back
String defaultAddress;
@@ -430,9 +514,10 @@
defaultAddress = null;
}
- List<InetSocketAddress> addressList = getAddresses(conf, defaultAddress,
+ Map<String, Map<String, InetSocketAddress>> addressList =
+ getAddresses(conf, defaultAddress,
DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, DFS_NAMENODE_RPC_ADDRESS_KEY);
- if (addressList == null) {
+ if (addressList.isEmpty()) {
throw new IOException("Incorrect configuration: namenode address "
+ DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY + " or "
+ DFS_NAMENODE_RPC_ADDRESS_KEY
@@ -442,10 +527,154 @@
}
/**
- * Given the InetSocketAddress for any configured communication with a
- * namenode, this method returns the corresponding nameservice ID,
- * by doing a reverse lookup on the list of nameservices until it
- * finds a match.
+ * Flatten the given map, as returned by other functions in this class,
+ * into a flat list of {@link ConfiguredNNAddress} instances.
+ */
+ public static List<ConfiguredNNAddress> flattenAddressMap(
+ Map<String, Map<String, InetSocketAddress>> map) {
+ List<ConfiguredNNAddress> ret = Lists.newArrayList();
+
+ for (Map.Entry<String, Map<String, InetSocketAddress>> entry :
+ map.entrySet()) {
+ String nsId = entry.getKey();
+ Map<String, InetSocketAddress> nnMap = entry.getValue();
+ for (Map.Entry<String, InetSocketAddress> e2 : nnMap.entrySet()) {
+ String nnId = e2.getKey();
+ InetSocketAddress addr = e2.getValue();
+
+ ret.add(new ConfiguredNNAddress(nsId, nnId, addr));
+ }
+ }
+ return ret;
+ }
+
+ /**
+ * Format the given map, as returned by other functions in this class,
+ * into a string suitable for debugging display. The format of this string
+ * should not be considered an interface, and is liable to change.
+ */
+ public static String addressMapToString(
+ Map<String, Map<String, InetSocketAddress>> map) {
+ StringBuilder b = new StringBuilder();
+ for (Map.Entry<String, Map<String, InetSocketAddress>> entry :
+ map.entrySet()) {
+ String nsId = entry.getKey();
+ Map<String, InetSocketAddress> nnMap = entry.getValue();
+ b.append("Nameservice <").append(nsId).append(">:").append("\n");
+ for (Map.Entry<String, InetSocketAddress> e2 : nnMap.entrySet()) {
+ b.append(" NN ID ").append(e2.getKey())
+ .append(" => ").append(e2.getValue()).append("\n");
+ }
+ }
+ return b.toString();
+ }
+
+ public static String nnAddressesAsString(Configuration conf) {
+ Map<String, Map<String, InetSocketAddress>> addresses =
+ getHaNnRpcAddresses(conf);
+ return addressMapToString(addresses);
+ }
+
+ /**
+ * Represent one of the NameNodes configured in the cluster.
+ */
+ public static class ConfiguredNNAddress {
+ private final String nameserviceId;
+ private final String namenodeId;
+ private final InetSocketAddress addr;
+
+ private ConfiguredNNAddress(String nameserviceId, String namenodeId,
+ InetSocketAddress addr) {
+ this.nameserviceId = nameserviceId;
+ this.namenodeId = namenodeId;
+ this.addr = addr;
+ }
+
+ public String getNameserviceId() {
+ return nameserviceId;
+ }
+
+ public String getNamenodeId() {
+ return namenodeId;
+ }
+
+ public InetSocketAddress getAddress() {
+ return addr;
+ }
+
+ @Override
+ public String toString() {
+ return "ConfiguredNNAddress[nsId=" + nameserviceId + ";" +
+ "nnId=" + namenodeId + ";addr=" + addr + "]";
+ }
+ }
+
+ /**
+ * Get a URI for each configured nameservice. If a nameservice is
+ * HA-enabled, then the logical URI of the nameservice is returned. If the
+ * nameservice is not HA-enabled, then a URI corresponding to an RPC address
+ * of the single NN for that nameservice is returned, preferring the service
+ * RPC address over the client RPC address.
+ *
+ * @param conf configuration
+ * @return a collection of all configured NN URIs, preferring service
+ * addresses
+ */
+ public static Collection<URI> getNsServiceRpcUris(Configuration conf) {
+ return getNameServiceUris(conf,
+ DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY,
+ DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY);
+ }
+
+ /**
+ * Get a URI for each configured nameservice. If a nameservice is
+ * HA-enabled, then the logical URI of the nameservice is returned. If the
+ * nameservice is not HA-enabled, then a URI corresponding to the address of
+ * the single NN for that nameservice is returned.
+ *
+ * @param conf configuration
+ * @param keys configuration keys to try in order to get the URI for non-HA
+ * nameservices
+ * @return a collection of all configured NN URIs
+ */
+ public static Collection<URI> getNameServiceUris(Configuration conf,
+ String... keys) {
+ Set<URI> ret = new HashSet<URI>();
+ for (String nsId : getNameServiceIds(conf)) {
+ if (HAUtil.isHAEnabled(conf, nsId)) {
+ // Add the logical URI of the nameservice.
+ try {
+ ret.add(new URI(HdfsConstants.HDFS_URI_SCHEME + "://" + nsId));
+ } catch (URISyntaxException ue) {
+ throw new IllegalArgumentException(ue);
+ }
+ } else {
+ // Add the URI corresponding to the address of the NN.
+ for (String key : keys) {
+ String addr = conf.get(concatSuffixes(key, nsId));
+ if (addr != null) {
+ ret.add(createUri(HdfsConstants.HDFS_URI_SCHEME,
+ NetUtils.createSocketAddr(addr)));
+ break;
+ }
+ }
+ }
+ }
+ // Add the generic configuration keys.
+ for (String key : keys) {
+ String addr = conf.get(key);
+ if (addr != null) {
+ ret.add(createUri("hdfs", NetUtils.createSocketAddr(addr)));
+ break;
+ }
+ }
+ return ret;
+ }
+
+ /**
+ * Given the InetSocketAddress this method returns the nameservice Id
+ * corresponding to the key with matching address, by doing a reverse
+ * lookup on the list of nameservices until it finds a match.
*
* Since the process of resolving URIs to Addresses is slightly expensive,
* this utility method should not be used in performance-critical routines.
@@ -463,91 +692,109 @@
* not the NameServiceId-suffixed keys.
* @return nameserviceId, or null if no match found
*/
- public static String getNameServiceIdFromAddress(Configuration conf,
- InetSocketAddress address, String... keys) {
- Collection<String> nameserviceIds = getNameServiceIds(conf);
-
+ public static String getNameServiceIdFromAddress(final Configuration conf,
+ final InetSocketAddress address, String... keys) {
// Configuration with a single namenode and no nameserviceId
- if (nameserviceIds == null || nameserviceIds.isEmpty()) {
- return null;
- }
- // Get the candidateAddresses for all the configured nameServiceIds
- for (String nameserviceId : nameserviceIds) {
- for (String key : keys) {
- String candidateAddress = conf.get(
- getNameServiceIdKey(key, nameserviceId));
- if (candidateAddress != null
- && address.equals(NetUtils.createSocketAddr(candidateAddress)))
- return nameserviceId;
- }
- }
- // didn't find a match
- return null;
+ String[] ids = getSuffixIDs(conf, address, keys);
+ return (ids != null) ? ids[0] : null;
}
-
+
/**
- * return server http or https address from the configuration
+ * return server http or https address from the configuration for a
+ * given namenode rpc address.
* @param conf
- * @param namenode - namenode address
+ * @param namenodeAddr - namenode RPC address
* @param httpsAddress -If true, and if security is enabled, returns server
* https address. If false, returns server http address.
* @return server http or https address
*/
public static String getInfoServer(
- InetSocketAddress namenode, Configuration conf, boolean httpsAddress) {
- String httpAddress = null;
-
- String httpAddressKey = (UserGroupInformation.isSecurityEnabled()
- && httpsAddress) ? DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_KEY
- : DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY;
- String httpAddressDefault = (UserGroupInformation.isSecurityEnabled()
- && httpsAddress) ? DFSConfigKeys.DFS_NAMENODE_HTTPS_ADDRESS_DEFAULT
- : DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_DEFAULT;
- if(namenode != null) {
+ InetSocketAddress namenodeAddr, Configuration conf, boolean httpsAddress) {
+ boolean securityOn = UserGroupInformation.isSecurityEnabled();
+ String httpAddressKey = (securityOn && httpsAddress) ?
+ DFS_NAMENODE_HTTPS_ADDRESS_KEY : DFS_NAMENODE_HTTP_ADDRESS_KEY;
+ String httpAddressDefault = (securityOn && httpsAddress) ?
+ DFS_NAMENODE_HTTPS_ADDRESS_DEFAULT : DFS_NAMENODE_HTTP_ADDRESS_DEFAULT;
+
+ String suffixes[];
+ if (namenodeAddr != null) {
// if non-default namenode, try reverse look up
// the nameServiceID if it is available
- String nameServiceId = DFSUtil.getNameServiceIdFromAddress(
- conf, namenode,
+ suffixes = getSuffixIDs(conf, namenodeAddr,
DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY,
DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY);
-
- if (nameServiceId != null) {
- httpAddress = conf.get(DFSUtil.getNameServiceIdKey(
- httpAddressKey, nameServiceId));
- }
- }
- // else - Use non-federation style configuration
- if (httpAddress == null) {
- httpAddress = conf.get(httpAddressKey, httpAddressDefault);
+ } else {
+ suffixes = new String[2];
}
- return httpAddress;
+ return getSuffixedConf(conf, httpAddressKey, httpAddressDefault, suffixes);
}
+
/**
- * @return key specific to a nameserviceId from a generic key
+ * Substitute a default host in the case that an address has been configured
+ * with a wildcard. This is used, for example, when determining the HTTP
+ * address of the NN -- if it's configured to bind to 0.0.0.0, we want to
+ * substitute the hostname from the filesystem URI rather than trying to
+ * connect to 0.0.0.0.
+ * @param configuredAddress the address found in the configuration
+ * @param defaultHost the host to substitute with, if configuredAddress
+ * is a local/wildcard address.
+ * @return the substituted address
+ * @throws IOException if it is a wildcard address and security is enabled
*/
- public static String getNameServiceIdKey(String key, String nameserviceId) {
- return key + "." + nameserviceId;
+ public static String substituteForWildcardAddress(String configuredAddress,
+ String defaultHost) throws IOException {
+ InetSocketAddress sockAddr = NetUtils.createSocketAddr(configuredAddress);
+ if (sockAddr.getAddress().isAnyLocalAddress()) {
+ if(UserGroupInformation.isSecurityEnabled()) {
+ throw new IOException("Cannot use a wildcard address with security. " +
+ "Must explicitly set bind address for Kerberos");
+ }
+ return defaultHost + ":" + sockAddr.getPort();
+ } else {
+ return configuredAddress;
+ }
+ }
+
+ private static String getSuffixedConf(Configuration conf,
+ String key, String defaultVal, String[] suffixes) {
+ String ret = conf.get(DFSUtil.addKeySuffixes(key, suffixes));
+ if (ret != null) {
+ return ret;
+ }
+ return conf.get(key, defaultVal);
}
/**
* Sets the node specific setting into generic configuration key. Looks up
- * value of "key.nameserviceId" and if found sets that value into generic key
- * in the conf. Note that this only modifies the runtime conf.
+ * value of "key.nameserviceId.namenodeId" and if found sets that value into
+ * generic key in the conf. If this is not found, falls back to
+ * "key.nameserviceId" and then the unmodified key.
+ *
+ * Note that this only modifies the runtime conf.
*
* @param conf
* Configuration object to lookup specific key and to set the value
* to the key passed. Note the conf object is modified.
* @param nameserviceId
- * nameservice Id to construct the node specific key.
+ * nameservice Id to construct the node specific key. Pass null if
+ * federation is not configured.
+ * @param nnId
+ * namenode Id to construct the node specific key. Pass null if
+ * HA is not configured.
* @param keys
* The key for which node specific value is looked up
*/
public static void setGenericConf(Configuration conf,
- String nameserviceId, String... keys) {
+ String nameserviceId, String nnId, String... keys) {
for (String key : keys) {
- String value = conf.get(getNameServiceIdKey(key, nameserviceId));
+ String value = conf.get(addKeySuffixes(key, nameserviceId, nnId));
+ if (value != null) {
+ conf.set(key, value);
+ continue;
+ }
+ value = conf.get(addKeySuffixes(key, nameserviceId));
if (value != null) {
conf.set(key, value);
}
@@ -572,34 +819,7 @@
public static int roundBytesToGB(long bytes) {
return Math.round((float)bytes/ 1024 / 1024 / 1024);
}
-
-
- /** Create a {@link NameNode} proxy */
- public static ClientProtocol createNamenode(Configuration conf)
- throws IOException {
- return createNamenode(NameNode.getAddress(conf), conf);
- }
-
- /** Create a {@link NameNode} proxy */
- public static ClientProtocol createNamenode( InetSocketAddress nameNodeAddr,
- Configuration conf) throws IOException {
- return createNamenode(nameNodeAddr, conf,
- UserGroupInformation.getCurrentUser());
- }
-
- /** Create a {@link NameNode} proxy */
- public static ClientProtocol createNamenode( InetSocketAddress nameNodeAddr,
- Configuration conf, UserGroupInformation ugi) throws IOException {
- /**
- * Currently we have simply burnt-in support for a SINGLE
- * protocol - protocolPB. This will be replaced
- * by a way to pick the right protocol based on the
- * version of the target server.
- */
- return new org.apache.hadoop.hdfs.protocolPB.
- ClientNamenodeProtocolTranslatorPB(nameNodeAddr, conf, ugi);
- }
-
+
/** Create a {@link ClientDatanodeProtocol} proxy */
public static ClientDatanodeProtocol createClientDatanodeProtocolProxy(
DatanodeID datanodeid, Configuration conf, int socketTimeout,
@@ -622,9 +842,9 @@
SocketFactory factory) throws IOException {
return new ClientDatanodeProtocolTranslatorPB(addr, ticket, conf, factory);
}
-
+
/**
- * Get name service Id for the {@link NameNode} based on namenode RPC address
+ * Get nameservice Id for the {@link NameNode} based on namenode RPC address
* matching the local node address.
*/
public static String getNamenodeNameServiceId(Configuration conf) {
@@ -632,7 +852,7 @@
}
/**
- * Get name service Id for the BackupNode based on backup node RPC address
+ * Get nameservice Id for the BackupNode based on backup node RPC address
* matching the local node address.
*/
public static String getBackupNameServiceId(Configuration conf) {
@@ -640,7 +860,7 @@
}
/**
- * Get name service Id for the secondary node based on secondary http address
+ * Get nameservice Id for the secondary node based on secondary http address
* matching the local node address.
*/
public static String getSecondaryNameServiceId(Configuration conf) {
@@ -652,13 +872,14 @@
* the address of the local node.
*
* If {@link DFSConfigKeys#DFS_FEDERATION_NAMESERVICE_ID} is not specifically
- * configured, this method determines the nameservice Id by matching the local
- * nodes address with the configured addresses. When a match is found, it
- * returns the nameservice Id from the corresponding configuration key.
+ * configured, and more than one nameservice Id is configured, this method
+ * determines the nameservice Id by matching the local node's address with the
+ * configured addresses. When a match is found, it returns the nameservice Id
+ * from the corresponding configuration key.
*
* @param conf Configuration
* @param addressKey configuration key to get the address.
- * @return name service Id on success, null on failure.
+ * @return nameservice Id on success, null if federation is not configured.
* @throws HadoopIllegalArgumentException on error
*/
private static String getNameServiceId(Configuration conf, String addressKey) {
@@ -666,34 +887,106 @@
if (nameserviceId != null) {
return nameserviceId;
}
-
- Collection<String> ids = getNameServiceIds(conf);
- if (ids == null || ids.size() == 0) {
- // Not federation configuration, hence no nameservice Id
- return null;
+ Collection<String> nsIds = getNameServiceIds(conf);
+ if (1 == nsIds.size()) {
+ return nsIds.toArray(new String[1])[0];
}
+ String nnId = conf.get(DFS_HA_NAMENODE_ID_KEY);
- // Match the rpc address with that of local address
+ return getSuffixIDs(conf, addressKey, null, nnId, LOCAL_ADDRESS_MATCHER)[0];
+ }
+
+ /**
+ * Returns nameservice Id and namenode Id when the local host matches the
+ * configuration parameter {@code addressKey}.<nameservice Id>.<namenode Id>
+ *
+ * @param conf Configuration
+ * @param addressKey configuration key corresponding to the address.
+ * @param knownNsId only look at configs for the given nameservice, if not-null
+ * @param knownNNId only look at configs for the given namenode, if not null
+ * @param matcher matching criteria for matching the address
+ * @return Array with nameservice Id and namenode Id on success. First element
+ * in the array is nameservice Id and second element is namenode Id.
+ * Null value indicates that the configuration does not have the
+ * Id.
+ * @throws HadoopIllegalArgumentException on error
+ */
+ static String[] getSuffixIDs(final Configuration conf, final String addressKey,
+ String knownNsId, String knownNNId,
+ final AddressMatcher matcher) {
+ String nameserviceId = null;
+ String namenodeId = null;
int found = 0;
- for (String id : ids) {
- String addr = conf.get(getNameServiceIdKey(addressKey, id));
- InetSocketAddress s = NetUtils.createSocketAddr(addr);
- if (NetUtils.isLocalAddress(s.getAddress())) {
- nameserviceId = id;
- found++;
+
+ Collection<String> nsIds = getNameServiceIds(conf);
+ for (String nsId : emptyAsSingletonNull(nsIds)) {
+ if (knownNsId != null && !knownNsId.equals(nsId)) {
+ continue;
+ }
+
+ Collection<String> nnIds = getNameNodeIds(conf, nsId);
+ for (String nnId : emptyAsSingletonNull(nnIds)) {
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(String.format("addressKey: %s nsId: %s nnId: %s",
+ addressKey, nsId, nnId));
+ }
+ if (knownNNId != null && !knownNNId.equals(nnId)) {
+ continue;
+ }
+ String key = addKeySuffixes(addressKey, nsId, nnId);
+ String addr = conf.get(key);
+ if (addr == null) {
+ continue;
+ }
+ InetSocketAddress s = null;
+ try {
+ s = NetUtils.createSocketAddr(addr);
+ } catch (Exception e) {
+ LOG.warn("Exception in creating socket address " + addr, e);
+ continue;
+ }
+ if (!s.isUnresolved() && matcher.match(s)) {
+ nameserviceId = nsId;
+ namenodeId = nnId;
+ found++;
+ }
}
}
if (found > 1) { // Only one address must match the local address
- throw new HadoopIllegalArgumentException(
- "Configuration has multiple RPC addresses that matches "
- + "the local node's address. Please configure the system with "
- + "the parameter " + DFS_FEDERATION_NAMESERVICE_ID);
+ String msg = "Configuration has multiple addresses that match "
+ + "local node's address. Please configure the system with "
+ + DFS_FEDERATION_NAMESERVICE_ID + " and "
+ + DFS_HA_NAMENODE_ID_KEY;
+ throw new HadoopIllegalArgumentException(msg);
}
- if (found == 0) {
- throw new HadoopIllegalArgumentException("Configuration address "
- + addressKey + " is missing in configuration with name service Id");
+ return new String[] { nameserviceId, namenodeId };
+ }
+
+ /**
+ * For given set of {@code keys} adds nameservice Id and or namenode Id
+ * and returns {nameserviceId, namenodeId} when address match is found.
+ * @see #getSuffixIDs(Configuration, String, AddressMatcher)
+ */
+ static String[] getSuffixIDs(final Configuration conf,
+ final InetSocketAddress address, final String... keys) {
+ AddressMatcher matcher = new AddressMatcher() {
+ @Override
+ public boolean match(InetSocketAddress s) {
+ return address.equals(s);
+ }
+ };
+
+ for (String key : keys) {
+ String[] ids = getSuffixIDs(conf, key, null, null, matcher);
+ if (ids != null && (ids [0] != null || ids[1] != null)) {
+ return ids;
+ }
}
- return nameserviceId;
+ return null;
+ }
+
+ private interface AddressMatcher {
+ public boolean match(InetSocketAddress s);
}
/** Create a URI from the scheme and address */
@@ -719,4 +1012,39 @@
RPC.setProtocolEngine(conf, protocol, ProtobufRpcEngine.class);
server.addProtocol(RpcKind.RPC_PROTOCOL_BUFFER, protocol, service);
}
+
+ /**
+ * Map a logical namenode ID to its service address. Use the given
+ * nameservice if specified, or the configured one if none is given.
+ *
+ * @param conf Configuration
+ * @param nsId which nameservice nnId is a part of, optional
+ * @param nnId the namenode ID to get the service addr for
+ * @return the service addr, null if it could not be determined
+ */
+ public static String getNamenodeServiceAddr(final Configuration conf,
+ String nsId, String nnId) {
+
+ if (nsId == null) {
+ Collection<String> nsIds = getNameServiceIds(conf);
+ if (1 == nsIds.size()) {
+ nsId = nsIds.toArray(new String[1])[0];
+ } else {
+ // No nameservice ID was given and more than one is configured
+ return null;
+ }
+ }
+
+ String serviceAddrKey = concatSuffixes(
+ DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, nsId, nnId);
+
+ String addrKey = concatSuffixes(
+ DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, nsId, nnId);
+
+ String serviceRpcAddr = conf.get(serviceAddrKey);
+ if (serviceRpcAddr == null) {
+ serviceRpcAddr = conf.get(addrKey);
+ }
+ return serviceRpcAddr;
+ }
}
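To illustrate the per-nameservice/per-NameNode resolution DFSUtil now offers, a hedged sketch that reuses the hypothetical HaConfigExample helper from the earlier sketch; the IDs "mycluster" and "nn1" remain assumed examples:

    import java.net.InetSocketAddress;
    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSUtil;

    public class DfsUtilLookupExample {
      public static void main(String[] args) {
        // HaConfigExample is the hypothetical helper sketched earlier.
        Configuration conf = HaConfigExample.haConf();
        // Map of nameserviceId -> (namenodeId -> RPC address) for every configured NN.
        Map<String, Map<String, InetSocketAddress>> rpc =
            DFSUtil.getHaNnRpcAddresses(conf);
        System.out.println(DFSUtil.addressMapToString(rpc));
        // Service address for one NN, falling back to the client RPC address
        // when no dedicated service RPC address is configured.
        System.out.println(DFSUtil.getNamenodeServiceAddr(conf, "mycluster", "nn1"));
      }
    }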
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
index 119bca9..8dfced3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java
@@ -106,8 +106,7 @@
throw new IOException("Incomplete HDFS URI, no host: "+ uri);
}
- InetSocketAddress namenode = NameNode.getAddress(uri.getAuthority());
- this.dfs = new DFSClient(namenode, conf, statistics);
+ this.dfs = new DFSClient(uri, conf, statistics);
this.uri = URI.create(uri.getScheme()+"://"+uri.getAuthority());
this.workingDir = getHomeDirectory();
}
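From a user's point of view, the practical effect is that an HA cluster is addressed by its logical URI rather than a single NameNode host. A minimal sketch, with the nameservice "mycluster" again assumed for illustration:

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class LogicalUriExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // With a failover proxy provider configured for "mycluster", this URI
        // transparently fails over between the active and standby NameNodes.
        FileSystem fs = FileSystem.get(URI.create("hdfs://mycluster"), conf);
        System.out.println(fs.exists(new Path("/")));
      }
    }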
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HAUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HAUtil.java
new file mode 100644
index 0000000..34e9d2e
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HAUtil.java
@@ -0,0 +1,261 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.HadoopIllegalArgumentException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSelector;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.security.SecurityUtil;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.token.Token;
+import static org.apache.hadoop.hdfs.protocol.HdfsConstants.HA_DT_SERVICE_PREFIX;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
+public class HAUtil {
+
+ private static final Log LOG =
+ LogFactory.getLog(HAUtil.class);
+
+ private HAUtil() { /* Hidden constructor */ }
+
+ /**
+ * Returns true if HA for namenode is configured for the given nameservice
+ *
+ * @param conf Configuration
+ * @param nsId nameservice, or null if no federated NS is configured
+ * @return true if HA is configured in the configuration; else false.
+ */
+ public static boolean isHAEnabled(Configuration conf, String nsId) {
+ Map<String, Map<String, InetSocketAddress>> addresses =
+ DFSUtil.getHaNnRpcAddresses(conf);
+ if (addresses == null) return false;
+ Map<String, InetSocketAddress> nnMap = addresses.get(nsId);
+ return nnMap != null && nnMap.size() > 1;
+ }
+
+ /**
+ * Returns true if HA is using a shared edits directory.
+ *
+ * @param conf Configuration
+ * @return true if HA config is using a shared edits dir, false otherwise.
+ */
+ public static boolean usesSharedEditsDir(Configuration conf) {
+ return null != conf.get(DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
+ }
+
+ /**
+ * Get the namenode Id by matching the {@code addressKey}
+ * with the address of the local node.
+ *
+ * If {@link DFSConfigKeys#DFS_HA_NAMENODE_ID_KEY} is not specifically
+ * configured, this method determines the namenode Id by matching the local
+ * node's address with the configured addresses. When a match is found, it
+ * returns the namenode Id from the corresponding configuration key.
+ *
+ * @param conf Configuration
+ * @return namenode Id on success, null on failure.
+ * @throws HadoopIllegalArgumentException on error
+ */
+ public static String getNameNodeId(Configuration conf, String nsId) {
+ String namenodeId = conf.getTrimmed(DFS_HA_NAMENODE_ID_KEY);
+ if (namenodeId != null) {
+ return namenodeId;
+ }
+
+ String suffixes[] = DFSUtil.getSuffixIDs(conf, DFS_NAMENODE_RPC_ADDRESS_KEY,
+ nsId, null, DFSUtil.LOCAL_ADDRESS_MATCHER);
+ if (suffixes == null) {
+ String msg = "Configuration " + DFS_NAMENODE_RPC_ADDRESS_KEY +
+ " must be suffixed with nameservice and namenode ID for HA " +
+ "configuration.";
+ throw new HadoopIllegalArgumentException(msg);
+ }
+
+ return suffixes[1];
+ }
+
+ /**
+ * Similar to
+ * {@link DFSUtil#getNameServiceIdFromAddress(Configuration,
+ * InetSocketAddress, String...)}
+ */
+ public static String getNameNodeIdFromAddress(final Configuration conf,
+ final InetSocketAddress address, String... keys) {
+ // Configuration with a single namenode and no nameserviceId
+ String[] ids = DFSUtil.getSuffixIDs(conf, address, keys);
+ if (ids != null && ids.length > 1) {
+ return ids[1];
+ }
+ return null;
+ }
+
+ /**
+ * Given the configuration for this node, return a Configuration object for
+ * the other node in an HA setup.
+ *
+ * @param myConf the configuration of this node
+ * @return the configuration of the other node in an HA setup
+ */
+ public static Configuration getConfForOtherNode(
+ Configuration myConf) {
+
+ String nsId = DFSUtil.getNamenodeNameServiceId(myConf);
+ Preconditions.checkArgument(nsId != null,
+ "Could not determine namespace id. Please ensure that this " +
+ "machine is one of the machines listed as a NN RPC address, " +
+ "or configure " + DFSConfigKeys.DFS_FEDERATION_NAMESERVICE_ID);
+
+ Collection<String> nnIds = DFSUtil.getNameNodeIds(myConf, nsId);
+ String myNNId = myConf.get(DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY);
+ Preconditions.checkArgument(nnIds != null,
+ "Could not determine namenode ids in namespace '%s'. " +
+ "Please configure " +
+ DFSUtil.addKeySuffixes(DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX,
+ nsId),
+ nsId);
+ Preconditions.checkArgument(nnIds.size() == 2,
+ "Expected exactly 2 NameNodes in namespace '%s'. " +
+ "Instead, got only %s (NN ids were '%s'",
+ nsId, nnIds.size(), Joiner.on("','").join(nnIds));
+ Preconditions.checkState(myNNId != null && !myNNId.isEmpty(),
+ "Could not determine own NN ID in namespace '%s'. Please " +
+ "ensure that this node is one of the machines listed as an " +
+ "NN RPC address, or configure " + DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY,
+ nsId);
+
+ ArrayList<String> nnSet = Lists.newArrayList(nnIds);
+ nnSet.remove(myNNId);
+ assert nnSet.size() == 1;
+ String activeNN = nnSet.get(0);
+
+ // Look up the address of the active NN.
+ Configuration confForOtherNode = new Configuration(myConf);
+ NameNode.initializeGenericKeys(confForOtherNode, nsId, activeNN);
+ return confForOtherNode;
+ }
+
+ /**
+ * This is used only by tests at the moment.
+ * @return true if the NN should allow read operations while in standby mode.
+ */
+ public static boolean shouldAllowStandbyReads(Configuration conf) {
+ return conf.getBoolean("dfs.ha.allow.stale.reads", false);
+ }
+
+ public static void setAllowStandbyReads(Configuration conf, boolean val) {
+ conf.setBoolean("dfs.ha.allow.stale.reads", val);
+ }
+
+ /**
+ * @return true if the given nameNodeUri appears to be a logical URI.
+ * This is the case if there is a failover proxy provider configured
+ * for it in the given configuration.
+ */
+ public static boolean isLogicalUri(
+ Configuration conf, URI nameNodeUri) {
+ String host = nameNodeUri.getHost();
+ String configKey = DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "."
+ + host;
+ return conf.get(configKey) != null;
+ }
+
+ /**
+ * Parse the HDFS URI out of the provided token.
+ * @throws IOException if the token is invalid
+ */
+ public static URI getServiceUriFromToken(
+ Token<DelegationTokenIdentifier> token)
+ throws IOException {
+ String tokStr = token.getService().toString();
+
+ if (tokStr.startsWith(HA_DT_SERVICE_PREFIX)) {
+ tokStr = tokStr.replaceFirst(HA_DT_SERVICE_PREFIX, "");
+ }
+
+ try {
+ return new URI(HdfsConstants.HDFS_URI_SCHEME + "://" +
+ tokStr);
+ } catch (URISyntaxException e) {
+ throw new IOException("Invalid token contents: '" +
+ tokStr + "'");
+ }
+ }
+
+ /**
+ * Get the service name used in the delegation token for the given logical
+ * HA service.
+ * @param uri the logical URI of the cluster
+ * @return the service name
+ */
+ public static Text buildTokenServiceForLogicalUri(URI uri) {
+ return new Text(HA_DT_SERVICE_PREFIX + uri.getHost());
+ }
+
+ /**
+ * @return true if this token corresponds to a logical nameservice
+ * rather than a specific namenode.
+ */
+ public static boolean isTokenForLogicalUri(
+ Token<DelegationTokenIdentifier> token) {
+ return token.getService().toString().startsWith(HA_DT_SERVICE_PREFIX);
+ }
+
+ /**
+ * Locate a delegation token associated with the given HA cluster URI, and if
+ * one is found, clone it to also represent the underlying namenode address.
+ * @param ugi the UGI to modify
+ * @param haUri the logical URI for the cluster
+ * @param singleNNAddr one of the NNs in the cluster to which the token
+ * applies
+ */
+ public static void cloneDelegationTokenForLogicalUri(
+ UserGroupInformation ugi, URI haUri,
+ InetSocketAddress singleNNAddr) {
+ Text haService = buildTokenServiceForLogicalUri(haUri);
+ Token<DelegationTokenIdentifier> haToken =
+ DelegationTokenSelector.selectHdfsDelegationToken(haService, ugi);
+ if (haToken == null) {
+ // no token
+ return;
+ }
+ Token<DelegationTokenIdentifier> specificToken =
+ new Token<DelegationTokenIdentifier>(haToken);
+ specificToken.setService(SecurityUtil.buildTokenService(singleNNAddr));
+ ugi.addToken(specificToken);
+ LOG.debug("Mapped HA service delegation token for logical URI " +
+ haUri + " to namenode " + singleNNAddr);
+ }
+}
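
A hedged sketch of the delegation-token behaviour HAUtil introduces; the
logical nameservice "mycluster" and the NN address below are hypothetical:

    import java.net.InetSocketAddress;
    import java.net.URI;

    import org.apache.hadoop.hdfs.HAUtil;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.security.UserGroupInformation;

    public class LogicalTokenServiceExample {
      public static void main(String[] args) throws Exception {
        URI haUri = new URI("hdfs://mycluster");

        // Tokens issued against a logical URI carry the "ha-hdfs:" prefix in
        // their service field instead of a host:port pair.
        Text service = HAUtil.buildTokenServiceForLogicalUri(haUri);
        System.out.println(service);                  // ha-hdfs:mycluster

        // Cloning maps such a token onto a concrete NN address so the RPC
        // layer can find it by host:port; with no matching token in the UGI
        // this call is simply a no-op.
        HAUtil.cloneDelegationTokenForLogicalUri(
            UserGroupInformation.getCurrentUser(), haUri,
            new InetSocketAddress("nn1.example.com", 8020));
      }
    }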
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HDFSPolicyProvider.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HDFSPolicyProvider.java
index 27702b5..6e21245 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HDFSPolicyProvider.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/HDFSPolicyProvider.java
@@ -19,6 +19,7 @@
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
@@ -44,6 +45,8 @@
new Service("security.inter.datanode.protocol.acl",
InterDatanodeProtocol.class),
new Service("security.namenode.protocol.acl", NamenodeProtocol.class),
+ new Service(CommonConfigurationKeys.SECURITY_HA_SERVICE_PROTOCOL_ACL,
+ HAServiceProtocol.class),
new Service(
CommonConfigurationKeys.HADOOP_SECURITY_SERVICE_AUTHORIZATION_REFRESH_POLICY,
RefreshAuthorizationPolicyProtocol.class),
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/NameNodeProxies.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/NameNodeProxies.java
new file mode 100644
index 0000000..650c313
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/NameNodeProxies.java
@@ -0,0 +1,333 @@
+package org.apache.hadoop.hdfs;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX;
+
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.net.InetSocketAddress;
+import java.net.URI;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSClient.Conf;
+import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
+import org.apache.hadoop.hdfs.protocol.ClientProtocol;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB;
+import org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.GetUserMappingsProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.GetUserMappingsProtocolPB;
+import org.apache.hadoop.hdfs.protocolPB.JournalProtocolPB;
+import org.apache.hadoop.hdfs.protocolPB.JournalProtocolTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolPB;
+import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.RefreshAuthorizationPolicyProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.RefreshAuthorizationPolicyProtocolPB;
+import org.apache.hadoop.hdfs.protocolPB.RefreshUserMappingsProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.RefreshUserMappingsProtocolPB;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.retry.FailoverProxyProvider;
+import org.apache.hadoop.io.retry.RetryPolicies;
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.io.retry.RetryProxy;
+import org.apache.hadoop.ipc.ProtobufRpcEngine;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.ipc.RemoteException;
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.RefreshUserMappingsProtocol;
+import org.apache.hadoop.security.SecurityUtil;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
+import org.apache.hadoop.tools.GetUserMappingsProtocol;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Create proxy objects to communicate with a remote NN. All remote access to an
+ * NN should be funneled through this class. Most of the time you'll want to use
+ * {@link NameNodeProxies#createProxy(Configuration, URI, Class)}, which will
+ * create either an HA- or non-HA-enabled client proxy as appropriate.
+ */
+public class NameNodeProxies {
+
+ private static final Log LOG = LogFactory.getLog(NameNodeProxies.class);
+
+ /**
+ * Wrapper for a client proxy as well as its associated service ID.
+ * This is simply used as a tuple-like return type for
+ * {@link NameNodeProxies#createProxy} and
+ * {@link NameNodeProxies#createNonHAProxy}.
+ */
+ public static class ProxyAndInfo<PROXYTYPE> {
+ private final PROXYTYPE proxy;
+ private final Text dtService;
+
+ public ProxyAndInfo(PROXYTYPE proxy, Text dtService) {
+ this.proxy = proxy;
+ this.dtService = dtService;
+ }
+
+ public PROXYTYPE getProxy() {
+ return proxy;
+ }
+
+ public Text getDelegationTokenService() {
+ return dtService;
+ }
+ }
+
+ /**
+ * Creates the namenode proxy with the passed protocol. This will handle
+ * creation of either HA- or non-HA-enabled proxy objects, depending on
+ * whether the provided URI is a configured logical URI.
+ *
+ * @param conf the configuration containing the required IPC
+ * properties, client failover configurations, etc.
+ * @param nameNodeUri the URI pointing either to a specific NameNode
+ * or to a logical nameservice.
+ * @param xface the IPC interface which should be created
+ * @return an object containing both the proxy and the associated
+ * delegation token service it corresponds to
+ * @throws IOException if there is an error creating the proxy
+ **/
+ @SuppressWarnings("unchecked")
+ public static <T> ProxyAndInfo<T> createProxy(Configuration conf,
+ URI nameNodeUri, Class<T> xface) throws IOException {
+ Class<FailoverProxyProvider<T>> failoverProxyProviderClass =
+ getFailoverProxyProviderClass(conf, nameNodeUri, xface);
+
+ if (failoverProxyProviderClass == null) {
+ // Non-HA case
+ return createNonHAProxy(conf, NameNode.getAddress(nameNodeUri), xface,
+ UserGroupInformation.getCurrentUser(), true);
+ } else {
+ // HA case
+ FailoverProxyProvider<T> failoverProxyProvider = NameNodeProxies
+ .createFailoverProxyProvider(conf, failoverProxyProviderClass, xface,
+ nameNodeUri);
+ Conf config = new Conf(conf);
+ T proxy = (T) RetryProxy.create(xface, failoverProxyProvider, RetryPolicies
+ .failoverOnNetworkException(RetryPolicies.TRY_ONCE_THEN_FAIL,
+ config.maxFailoverAttempts, config.failoverSleepBaseMillis,
+ config.failoverSleepMaxMillis));
+
+ Text dtService = HAUtil.buildTokenServiceForLogicalUri(nameNodeUri);
+ return new ProxyAndInfo<T>(proxy, dtService);
+ }
+ }
+
+ /**
+ * Creates an explicitly non-HA-enabled proxy object. Most of the time you
+ * don't want to use this, and should instead use {@link NameNodeProxies#createProxy}.
+ *
+ * @param conf the configuration object
+ * @param nnAddr address of the remote NN to connect to
+ * @param xface the IPC interface which should be created
+ * @param ugi the user who is making the calls on the proxy object
+ * @param withRetries certain interfaces have a non-standard retry policy
+ * @return an object containing both the proxy and the associated
+ * delegation token service it corresponds to
+ * @throws IOException
+ */
+ @SuppressWarnings("unchecked")
+ public static <T> ProxyAndInfo<T> createNonHAProxy(
+ Configuration conf, InetSocketAddress nnAddr, Class<T> xface,
+ UserGroupInformation ugi, boolean withRetries) throws IOException {
+ Text dtService = SecurityUtil.buildTokenService(nnAddr);
+
+ T proxy;
+ if (xface == ClientProtocol.class) {
+ proxy = (T) createNNProxyWithClientProtocol(nnAddr, conf, ugi,
+ withRetries);
+ } else if (xface == JournalProtocol.class) {
+ proxy = (T) createNNProxyWithJournalProtocol(nnAddr, conf, ugi);
+ } else if (xface == NamenodeProtocol.class) {
+ proxy = (T) createNNProxyWithNamenodeProtocol(nnAddr, conf, ugi,
+ withRetries);
+ } else if (xface == GetUserMappingsProtocol.class) {
+ proxy = (T) createNNProxyWithGetUserMappingsProtocol(nnAddr, conf, ugi);
+ } else if (xface == RefreshUserMappingsProtocol.class) {
+ proxy = (T) createNNProxyWithRefreshUserMappingsProtocol(nnAddr, conf, ugi);
+ } else if (xface == RefreshAuthorizationPolicyProtocol.class) {
+ proxy = (T) createNNProxyWithRefreshAuthorizationPolicyProtocol(nnAddr,
+ conf, ugi);
+ } else {
+ String message = "Upsupported protocol found when creating the proxy " +
+ "connection to NameNode: " +
+ ((xface != null) ? xface.getClass().getName() : "null");
+ LOG.error(message);
+ throw new IllegalStateException(message);
+ }
+ return new ProxyAndInfo<T>(proxy, dtService);
+ }
+
+ private static JournalProtocol createNNProxyWithJournalProtocol(
+ InetSocketAddress address, Configuration conf, UserGroupInformation ugi)
+ throws IOException {
+ JournalProtocolPB proxy = (JournalProtocolPB) createNameNodeProxy(address,
+ conf, ugi, JournalProtocolPB.class);
+ return new JournalProtocolTranslatorPB(proxy);
+ }
+
+ private static RefreshAuthorizationPolicyProtocol
+ createNNProxyWithRefreshAuthorizationPolicyProtocol(InetSocketAddress address,
+ Configuration conf, UserGroupInformation ugi) throws IOException {
+ RefreshAuthorizationPolicyProtocolPB proxy = (RefreshAuthorizationPolicyProtocolPB)
+ createNameNodeProxy(address, conf, ugi, RefreshAuthorizationPolicyProtocolPB.class);
+ return new RefreshAuthorizationPolicyProtocolClientSideTranslatorPB(proxy);
+ }
+
+ private static RefreshUserMappingsProtocol
+ createNNProxyWithRefreshUserMappingsProtocol(InetSocketAddress address,
+ Configuration conf, UserGroupInformation ugi) throws IOException {
+ RefreshUserMappingsProtocolPB proxy = (RefreshUserMappingsProtocolPB)
+ createNameNodeProxy(address, conf, ugi, RefreshUserMappingsProtocolPB.class);
+ return new RefreshUserMappingsProtocolClientSideTranslatorPB(proxy);
+ }
+
+ private static GetUserMappingsProtocol createNNProxyWithGetUserMappingsProtocol(
+ InetSocketAddress address, Configuration conf, UserGroupInformation ugi)
+ throws IOException {
+ GetUserMappingsProtocolPB proxy = (GetUserMappingsProtocolPB)
+ createNameNodeProxy(address, conf, ugi, GetUserMappingsProtocolPB.class);
+ return new GetUserMappingsProtocolClientSideTranslatorPB(proxy);
+ }
+
+ private static NamenodeProtocol createNNProxyWithNamenodeProtocol(
+ InetSocketAddress address, Configuration conf, UserGroupInformation ugi,
+ boolean withRetries) throws IOException {
+ NamenodeProtocolPB proxy = (NamenodeProtocolPB) createNameNodeProxy(
+ address, conf, ugi, NamenodeProtocolPB.class);
+ if (withRetries) { // create the proxy with retries
+ RetryPolicy timeoutPolicy = RetryPolicies.exponentialBackoffRetry(5, 200,
+ TimeUnit.MILLISECONDS);
+ Map<Class<? extends Exception>, RetryPolicy> exceptionToPolicyMap
+ = new HashMap<Class<? extends Exception>, RetryPolicy>();
+ RetryPolicy methodPolicy = RetryPolicies.retryByException(timeoutPolicy,
+ exceptionToPolicyMap);
+ Map<String, RetryPolicy> methodNameToPolicyMap
+ = new HashMap<String, RetryPolicy>();
+ methodNameToPolicyMap.put("getBlocks", methodPolicy);
+ methodNameToPolicyMap.put("getAccessKeys", methodPolicy);
+ proxy = (NamenodeProtocolPB) RetryProxy.create(NamenodeProtocolPB.class,
+ proxy, methodNameToPolicyMap);
+ }
+ return new NamenodeProtocolTranslatorPB(proxy);
+ }
+
+ private static ClientProtocol createNNProxyWithClientProtocol(
+ InetSocketAddress address, Configuration conf, UserGroupInformation ugi,
+ boolean withRetries) throws IOException {
+ ClientNamenodeProtocolPB proxy = (ClientNamenodeProtocolPB) NameNodeProxies
+ .createNameNodeProxy(address, conf, ugi, ClientNamenodeProtocolPB.class);
+ if (withRetries) { // create the proxy with retries
+ RetryPolicy createPolicy = RetryPolicies
+ .retryUpToMaximumCountWithFixedSleep(5,
+ HdfsConstants.LEASE_SOFTLIMIT_PERIOD, TimeUnit.MILLISECONDS);
+
+ Map<Class<? extends Exception>, RetryPolicy> remoteExceptionToPolicyMap
+ = new HashMap<Class<? extends Exception>, RetryPolicy>();
+ remoteExceptionToPolicyMap.put(AlreadyBeingCreatedException.class,
+ createPolicy);
+
+ Map<Class<? extends Exception>, RetryPolicy> exceptionToPolicyMap
+ = new HashMap<Class<? extends Exception>, RetryPolicy>();
+ exceptionToPolicyMap.put(RemoteException.class, RetryPolicies
+ .retryByRemoteException(RetryPolicies.TRY_ONCE_THEN_FAIL,
+ remoteExceptionToPolicyMap));
+ RetryPolicy methodPolicy = RetryPolicies.retryByException(
+ RetryPolicies.TRY_ONCE_THEN_FAIL, exceptionToPolicyMap);
+ Map<String, RetryPolicy> methodNameToPolicyMap
+ = new HashMap<String, RetryPolicy>();
+
+ methodNameToPolicyMap.put("create", methodPolicy);
+
+ proxy = (ClientNamenodeProtocolPB) RetryProxy
+ .create(ClientNamenodeProtocolPB.class, proxy, methodNameToPolicyMap);
+ }
+ return new ClientNamenodeProtocolTranslatorPB(proxy);
+ }
+
+ @SuppressWarnings("unchecked")
+ private static Object createNameNodeProxy(InetSocketAddress address,
+ Configuration conf, UserGroupInformation ugi, Class xface)
+ throws IOException {
+ RPC.setProtocolEngine(conf, xface, ProtobufRpcEngine.class);
+ Object proxy = RPC.getProxy(xface, RPC.getProtocolVersion(xface), address,
+ ugi, conf, NetUtils.getDefaultSocketFactory(conf));
+ return proxy;
+ }
+
+ /** Gets the configured Failover proxy provider's class */
+ private static <T> Class<FailoverProxyProvider<T>> getFailoverProxyProviderClass(
+ Configuration conf, URI nameNodeUri, Class<T> xface) throws IOException {
+ if (nameNodeUri == null) {
+ return null;
+ }
+ String host = nameNodeUri.getHost();
+
+ String configKey = DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "."
+ + host;
+ try {
+ @SuppressWarnings("unchecked")
+ Class<FailoverProxyProvider<T>> ret = (Class<FailoverProxyProvider<T>>) conf
+ .getClass(configKey, null, FailoverProxyProvider.class);
+ if (ret != null) {
+ // If we found a proxy provider, then this URI should be a logical NN.
+ // Given that, it shouldn't have a non-default port number.
+ int port = nameNodeUri.getPort();
+ if (port > 0 && port != NameNode.DEFAULT_PORT) {
+ throw new IOException("Port " + port + " specified in URI "
+ + nameNodeUri + " but host '" + host
+ + "' is a logical (HA) namenode"
+ + " and does not use port information.");
+ }
+ }
+ return ret;
+ } catch (RuntimeException e) {
+ if (e.getCause() instanceof ClassNotFoundException) {
+ throw new IOException("Could not load failover proxy provider class "
+ + conf.get(configKey) + " which is configured for authority "
+ + nameNodeUri, e);
+ } else {
+ throw e;
+ }
+ }
+ }
+
+ /** Creates the Failover proxy provider instance*/
+ @SuppressWarnings("unchecked")
+ private static <T> FailoverProxyProvider<T> createFailoverProxyProvider(
+ Configuration conf, Class<FailoverProxyProvider<T>> failoverProxyProviderClass,
+ Class<T> xface, URI nameNodeUri) throws IOException {
+ Preconditions.checkArgument(
+ xface.isAssignableFrom(NamenodeProtocols.class),
+ "Interface %s is not a NameNode protocol", xface);
+ try {
+ Constructor<FailoverProxyProvider<T>> ctor = failoverProxyProviderClass
+ .getConstructor(Configuration.class, URI.class, Class.class);
+ FailoverProxyProvider<?> provider = ctor.newInstance(conf, nameNodeUri,
+ xface);
+ return (FailoverProxyProvider<T>) provider;
+ } catch (Exception e) {
+ String message = "Couldn't create proxy provider " + failoverProxyProviderClass;
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(message, e);
+ }
+ if (e.getCause() instanceof IOException) {
+ throw (IOException) e.getCause();
+ } else {
+ throw new IOException(message, e);
+ }
+ }
+ }
+
+}
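
A minimal sketch of the intended entry point; "mycluster" is a hypothetical
logical URI, and whether an HA or plain proxy comes back depends on whether a
failover proxy provider is configured under the
DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX key for the URI's host:

    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.NameNodeProxies;
    import org.apache.hadoop.hdfs.protocol.ClientProtocol;

    public class CreateProxyExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Nothing about the call changes for the caller between the HA and
        // non-HA cases; only the provider configuration differs.
        NameNodeProxies.ProxyAndInfo<ClientProtocol> proxyInfo =
            NameNodeProxies.createProxy(
                conf, new URI("hdfs://mycluster"), ClientProtocol.class);

        ClientProtocol namenode = proxyInfo.getProxy();
        System.out.println("token service: "
            + proxyInfo.getDelegationTokenService());
        // In the HA case calls on 'namenode' fail over transparently; in the
        // non-HA case the URI must name a real host (and optional port).
      }
    }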
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/BlockListAsLongs.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/BlockListAsLongs.java
index e1006a6..58af5fd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/BlockListAsLongs.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/BlockListAsLongs.java
@@ -40,7 +40,7 @@
* - followed by the invalid replica represented with three -1s;
* - followed by the under-construction replica list where each replica is
* represented by 4 longs: three for the block id, length, generation
- * stamp, and the forth for the replica state.
+ * stamp, and the fourth for the replica state.
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
@@ -304,4 +304,16 @@
blockList[idx+1] = -1;
blockList[idx+2] = -1;
}
+
+ public long getMaxGsInBlockList() {
+ long maxGs = -1;
+ Iterator<Block> iter = getBlockReportIterator();
+ while (iter.hasNext()) {
+ Block b = iter.next();
+ if (b.getGenerationStamp() > maxGs) {
+ maxGs = b.getGenerationStamp();
+ }
+ }
+ return maxGs;
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java
index ab6babc..099fd28 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java
@@ -39,6 +39,7 @@
import org.apache.hadoop.hdfs.server.namenode.SafeModeException;
import org.apache.hadoop.io.EnumSetWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.retry.Idempotent;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.KerberosInfo;
import org.apache.hadoop.security.token.Token;
@@ -114,6 +115,7 @@
* @throws UnresolvedLinkException If <code>src</code> contains a symlink
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public LocatedBlocks getBlockLocations(String src,
long offset,
long length)
@@ -125,6 +127,7 @@
* @return a set of server default configuration values
* @throws IOException
*/
+ @Idempotent
public FsServerDefaults getServerDefaults() throws IOException;
/**
@@ -228,6 +231,7 @@
* @throws UnresolvedLinkException if <code>src</code> contains a symlink
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public boolean setReplication(String src, short replication)
throws AccessControlException, DSQuotaExceededException,
FileNotFoundException, SafeModeException, UnresolvedLinkException,
@@ -242,6 +246,7 @@
* @throws UnresolvedLinkException If <code>src</code> contains a symlink
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public void setPermission(String src, FsPermission permission)
throws AccessControlException, FileNotFoundException, SafeModeException,
UnresolvedLinkException, IOException;
@@ -259,12 +264,13 @@
* @throws UnresolvedLinkException If <code>src</code> contains a symlink
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public void setOwner(String src, String username, String groupname)
throws AccessControlException, FileNotFoundException, SafeModeException,
UnresolvedLinkException, IOException;
/**
- * The client can give up on a blcok by calling abandonBlock().
+ * The client can give up on a block by calling abandonBlock().
* The client can then
* either obtain a new block, or complete or abandon the file.
* Any partial writes to the block will be discarded.
@@ -331,6 +337,7 @@
* @throws UnresolvedLinkException If <code>src</code> contains a symlink
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public LocatedBlock getAdditionalDatanode(final String src, final ExtendedBlock blk,
final DatanodeInfo[] existings, final DatanodeInfo[] excludes,
final int numAdditionalNodes, final String clientName
@@ -368,6 +375,7 @@
* locations on datanodes).
* @param blocks Array of located blocks to report
*/
+ @Idempotent
public void reportBadBlocks(LocatedBlock[] blocks) throws IOException;
///////////////////////////////////////
@@ -482,6 +490,7 @@
* RunTimeExceptions:
* @throws InvalidPathException If <code>src</code> is invalid
*/
+ @Idempotent
public boolean mkdirs(String src, FsPermission masked, boolean createParent)
throws AccessControlException, FileAlreadyExistsException,
FileNotFoundException, NSQuotaExceededException,
@@ -502,6 +511,7 @@
* @throws UnresolvedLinkException If <code>src</code> contains a symlink
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public DirectoryListing getListing(String src,
byte[] startAfter,
boolean needLocation)
@@ -531,6 +541,7 @@
* @throws AccessControlException permission denied
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public void renewLease(String clientName) throws AccessControlException,
IOException;
@@ -543,6 +554,7 @@
* @return true if the file is already closed
* @throws IOException
*/
+ @Idempotent
public boolean recoverLease(String src, String clientName) throws IOException;
public int GET_STATS_CAPACITY_IDX = 0;
@@ -554,7 +566,7 @@
/**
* Get a set of statistics about the filesystem.
- * Right now, only three values are returned.
+ * Right now, only seven values are returned.
* <ul>
* <li> [0] contains the total storage capacity of the system, in bytes.</li>
* <li> [1] contains the total used space of the system, in bytes.</li>
@@ -567,6 +579,7 @@
* Use public constants like {@link #GET_STATS_CAPACITY_IDX} in place of
* actual numbers to index into the array.
*/
+ @Idempotent
public long[] getStats() throws IOException;
/**
@@ -575,6 +588,7 @@
* Return live datanodes if type is LIVE; dead datanodes if type is DEAD;
* otherwise all datanodes if type is ALL.
*/
+ @Idempotent
public DatanodeInfo[] getDatanodeReport(HdfsConstants.DatanodeReportType type)
throws IOException;
@@ -585,6 +599,7 @@
* @throws IOException
* @throws UnresolvedLinkException if the path contains a symlink.
*/
+ @Idempotent
public long getPreferredBlockSize(String filename)
throws IOException, UnresolvedLinkException;
@@ -700,9 +715,9 @@
* all corrupt files, call this method repeatedly and each time pass in the
* cookie returned from the previous call.
*/
- public CorruptFileBlocks
- listCorruptFileBlocks(String path, String cookie)
- throws IOException;
+ @Idempotent
+ public CorruptFileBlocks listCorruptFileBlocks(String path, String cookie)
+ throws IOException;
/**
* Dumps namenode data structures into specified file. If the file
@@ -719,6 +734,7 @@
* @param bandwidth Balancer bandwidth in bytes per second for this datanode.
* @throws IOException
*/
+ @Idempotent
public void setBalancerBandwidth(long bandwidth) throws IOException;
/**
@@ -732,6 +748,7 @@
* @throws UnresolvedLinkException if the path contains a symlink.
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public HdfsFileStatus getFileInfo(String src) throws AccessControlException,
FileNotFoundException, UnresolvedLinkException, IOException;
@@ -747,6 +764,7 @@
* @throws UnresolvedLinkException if <code>src</code> contains a symlink
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public HdfsFileStatus getFileLinkInfo(String src)
throws AccessControlException, UnresolvedLinkException, IOException;
@@ -759,6 +777,7 @@
* @throws UnresolvedLinkException if <code>path</code> contains a symlink.
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public ContentSummary getContentSummary(String path)
throws AccessControlException, FileNotFoundException,
UnresolvedLinkException, IOException;
@@ -784,6 +803,7 @@
* @throws UnresolvedLinkException if the <code>path</code> contains a symlink.
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public void setQuota(String path, long namespaceQuota, long diskspaceQuota)
throws AccessControlException, FileNotFoundException,
UnresolvedLinkException, IOException;
@@ -799,6 +819,7 @@
* @throws UnresolvedLinkException if <code>src</code> contains a symlink.
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public void fsync(String src, String client)
throws AccessControlException, FileNotFoundException,
UnresolvedLinkException, IOException;
@@ -818,6 +839,7 @@
* @throws UnresolvedLinkException if <code>src</code> contains a symlink.
* @throws IOException If an I/O error occurred
*/
+ @Idempotent
public void setTimes(String src, long mtime, long atime)
throws AccessControlException, FileNotFoundException,
UnresolvedLinkException, IOException;
@@ -858,6 +880,7 @@
* @throws IOException If the given path does not refer to a symlink
* or an I/O error occurred
*/
+ @Idempotent
public String getLinkTarget(String path) throws AccessControlException,
FileNotFoundException, IOException;
@@ -873,6 +896,7 @@
* @return a located block with a new generation stamp and an access token
* @throws IOException if any error occurs
*/
+ @Idempotent
public LocatedBlock updateBlockForPipeline(ExtendedBlock block,
String clientName) throws IOException;
@@ -896,6 +920,7 @@
* @return Token<DelegationTokenIdentifier>
* @throws IOException
*/
+ @Idempotent
public Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
throws IOException;
@@ -906,6 +931,7 @@
* @return the new expiration time
* @throws IOException
*/
+ @Idempotent
public long renewDelegationToken(Token<DelegationTokenIdentifier> token)
throws IOException;
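
The @Idempotent markers are what let the client-side failover machinery decide
which calls can safely be re-issued against the other NN when the outcome of
the first attempt is unknown. A hedged sketch of that decision, done here via
plain reflection rather than by quoting the actual retry-policy code:

    import java.lang.reflect.Method;

    import org.apache.hadoop.hdfs.protocol.ClientProtocol;
    import org.apache.hadoop.io.retry.Idempotent;

    public class IdempotentCheckExample {
      static boolean safeToRetry(String methodName, Class<?>... paramTypes)
          throws NoSuchMethodException {
        Method m = ClientProtocol.class.getMethod(methodName, paramTypes);
        return m.isAnnotationPresent(Idempotent.class);
      }

      public static void main(String[] args) throws Exception {
        // Both methods are annotated @Idempotent by this patch, so retrying
        // them after a failover cannot cause duplicate side effects.
        System.out.println(safeToRetry("getFileInfo", String.class));  // true
        System.out.println(safeToRetry("getStats"));                   // true
      }
    }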
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsConstants.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsConstants.java
index 6b4835f..da64b9e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsConstants.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/HdfsConstants.java
@@ -100,6 +100,14 @@
public static final String HDFS_URI_SCHEME = "hdfs";
/**
+ * A prefix put before the namenode URI inside the "service" field
+ * of a delegation token, indicating that the URI is a logical (HA)
+ * URI.
+ */
+ public static final String HA_DT_SERVICE_PREFIX = "ha-hdfs:";
+
+
+ /**
* Please see {@link LayoutVersion} on adding new layout version.
*/
public static final int LAYOUT_VERSION = LayoutVersion
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java
index 729748f..3680ee5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java
@@ -91,7 +91,10 @@
STORED_TXIDS(-37, "Transaction IDs are stored in edits log and image files"),
TXID_BASED_LAYOUT(-38, "File names in NN Storage are based on transaction IDs"),
EDITLOG_OP_OPTIMIZATION(-39,
- "Use LongWritable and ShortWritable directly instead of ArrayWritable of UTF8");
+ "Use LongWritable and ShortWritable directly instead of ArrayWritable of UTF8"),
+ OPTIMIZE_PERSIST_BLOCKS(-40,
+ "Serialize block lists with delta-encoded variable length ints, " +
+ "add OP_UPDATE_BLOCKS");
final int lv;
final int ancestorLV;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientDatanodeProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientDatanodeProtocolTranslatorPB.java
index f6a63fb..7382543 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientDatanodeProtocolTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientDatanodeProtocolTranslatorPB.java
@@ -45,6 +45,7 @@
import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.ProtocolMetaInterface;
import org.apache.hadoop.ipc.ProtocolSignature;
+import org.apache.hadoop.ipc.ProtocolTranslator;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RpcClientUtil;
import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind;
@@ -63,7 +64,8 @@
@InterfaceAudience.Private
@InterfaceStability.Stable
public class ClientDatanodeProtocolTranslatorPB implements
- ProtocolMetaInterface, ClientDatanodeProtocol, Closeable {
+ ProtocolMetaInterface, ClientDatanodeProtocol,
+ ProtocolTranslator, Closeable {
public static final Log LOG = LogFactory
.getLog(ClientDatanodeProtocolTranslatorPB.class);
@@ -198,4 +200,9 @@
ClientDatanodeProtocolPB.class, RpcKind.RPC_PROTOCOL_BUFFER,
RPC.getProtocolVersion(ClientDatanodeProtocolPB.class), methodName);
}
+
+ @Override
+ public Object getUnderlyingProxyObject() {
+ return rpcProxy;
+ }
}
\ No newline at end of file
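
Implementing ProtocolTranslator lets callers that were handed the translator
recover the raw protobuf proxy underneath it, for example when a proxy needs
to be stopped or its connection inspected. A small, hedged illustration of
that unwrapping step:

    import org.apache.hadoop.ipc.ProtocolTranslator;

    public class UnwrapTranslatorExample {
      // Returns the underlying RPC proxy if the argument is a translator,
      // otherwise the argument itself.
      static Object unwrapIfTranslator(Object proxyOrTranslator) {
        if (proxyOrTranslator instanceof ProtocolTranslator) {
          return ((ProtocolTranslator) proxyOrTranslator)
              .getUnderlyingProxyObject();
        }
        return proxyOrTranslator;
      }
    }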
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java
index 3691584..46a3c82 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/ClientNamenodeProtocolTranslatorPB.java
@@ -20,15 +20,10 @@
import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.IOException;
-import java.net.InetSocketAddress;
import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.TimeUnit;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FileAlreadyExistsException;
@@ -49,6 +44,7 @@
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.UpgradeAction;
+import org.apache.hadoop.ipc.ProtocolTranslator;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
@@ -137,52 +133,14 @@
@InterfaceAudience.Private
@InterfaceStability.Stable
public class ClientNamenodeProtocolTranslatorPB implements
- ProtocolMetaInterface, ClientProtocol, Closeable {
+ ProtocolMetaInterface, ClientProtocol, Closeable, ProtocolTranslator {
final private ClientNamenodeProtocolPB rpcProxy;
- private static ClientNamenodeProtocolPB createNamenode(
- InetSocketAddress nameNodeAddr, Configuration conf,
- UserGroupInformation ugi) throws IOException {
- RPC.setProtocolEngine(conf, ClientNamenodeProtocolPB.class,
- ProtobufRpcEngine.class);
- return RPC.getProxy(ClientNamenodeProtocolPB.class,
- RPC.getProtocolVersion(ClientNamenodeProtocolPB.class), nameNodeAddr, ugi, conf,
- NetUtils.getSocketFactory(conf, ClientNamenodeProtocolPB.class));
+ public ClientNamenodeProtocolTranslatorPB(ClientNamenodeProtocolPB proxy)
+ throws IOException {
+ rpcProxy = proxy;
}
-
- /** Create a {@link NameNode} proxy */
- static ClientNamenodeProtocolPB createNamenodeWithRetry(
- ClientNamenodeProtocolPB rpcNamenode) {
- RetryPolicy createPolicy = RetryPolicies
- .retryUpToMaximumCountWithFixedSleep(5,
- HdfsConstants.LEASE_SOFTLIMIT_PERIOD, TimeUnit.MILLISECONDS);
-
- Map<Class<? extends Exception>, RetryPolicy> remoteExceptionToPolicyMap
- = new HashMap<Class<? extends Exception>, RetryPolicy>();
- remoteExceptionToPolicyMap.put(AlreadyBeingCreatedException.class,
- createPolicy);
-
- Map<Class<? extends Exception>, RetryPolicy> exceptionToPolicyMap =
- new HashMap<Class<? extends Exception>, RetryPolicy>();
- exceptionToPolicyMap.put(RemoteException.class, RetryPolicies
- .retryByRemoteException(RetryPolicies.TRY_ONCE_THEN_FAIL,
- remoteExceptionToPolicyMap));
- RetryPolicy methodPolicy = RetryPolicies.retryByException(
- RetryPolicies.TRY_ONCE_THEN_FAIL, exceptionToPolicyMap);
- Map<String, RetryPolicy> methodNameToPolicyMap = new HashMap<String, RetryPolicy>();
-
- methodNameToPolicyMap.put("create", methodPolicy);
-
- return (ClientNamenodeProtocolPB) RetryProxy.create(
- ClientNamenodeProtocolPB.class, rpcNamenode, methodNameToPolicyMap);
- }
-
- public ClientNamenodeProtocolTranslatorPB(InetSocketAddress nameNodeAddr,
- Configuration conf, UserGroupInformation ugi) throws IOException {
-
- rpcProxy = createNamenodeWithRetry(createNamenode(nameNodeAddr, conf, ugi));
- }
-
+
public void close() {
RPC.stopProxy(rpcProxy);
}
@@ -866,4 +824,9 @@
ClientNamenodeProtocolPB.class, RpcKind.RPC_PROTOCOL_BUFFER,
RPC.getProtocolVersion(ClientNamenodeProtocolPB.class), methodName);
}
+
+ @Override
+ public Object getUnderlyingProxyObject() {
+ return rpcProxy;
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java
index d47eac2..2a661c0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolClientSideTranslatorPB.java
@@ -41,6 +41,7 @@
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ErrorReportRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.HeartbeatRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.HeartbeatResponseProto;
+import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.NNHAStatusHeartbeatProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ProcessUpgradeRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ProcessUpgradeResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.RegisterDatanodeRequestProto;
@@ -55,6 +56,7 @@
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
+import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
@@ -161,7 +163,7 @@
}
@Override
- public DatanodeCommand[] sendHeartbeat(DatanodeRegistration registration,
+ public HeartbeatResponse sendHeartbeat(DatanodeRegistration registration,
StorageReport[] reports, int xmitsInProgress, int xceiverCount,
int failedVolumes) throws IOException {
HeartbeatRequestProto.Builder builder = HeartbeatRequestProto.newBuilder()
@@ -184,7 +186,7 @@
cmds[index] = PBHelper.convert(p);
index++;
}
- return cmds;
+ return new HeartbeatResponse(cmds, PBHelper.convert(resp.getHaStatus()));
}
@Override
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java
index 413bd3a..c653daa 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/DatanodeProtocolServerSideTranslatorPB.java
@@ -51,6 +51,7 @@
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
+import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
@@ -103,7 +104,7 @@
@Override
public HeartbeatResponseProto sendHeartbeat(RpcController controller,
HeartbeatRequestProto request) throws ServiceException {
- DatanodeCommand[] cmds = null;
+ HeartbeatResponse response;
try {
List<StorageReportProto> list = request.getReportsList();
StorageReport[] report = new StorageReport[list.size()];
@@ -113,7 +114,7 @@
p.getCapacity(), p.getDfsUsed(), p.getRemaining(),
p.getBlockPoolUsed());
}
- cmds = impl.sendHeartbeat(PBHelper.convert(request.getRegistration()),
+ response = impl.sendHeartbeat(PBHelper.convert(request.getRegistration()),
report, request.getXmitsInProgress(), request.getXceiverCount(),
request.getFailedVolumes());
} catch (IOException e) {
@@ -121,6 +122,7 @@
}
HeartbeatResponseProto.Builder builder = HeartbeatResponseProto
.newBuilder();
+ DatanodeCommand[] cmds = response.getCommands();
if (cmds != null) {
for (int i = 0; i < cmds.length; i++) {
if (cmds[i] != null) {
@@ -128,6 +130,7 @@
}
}
}
+ builder.setHaStatus(PBHelper.convert(response.getNameNodeHaState()));
return builder.build();
}
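
With the heartbeat now returning a HeartbeatResponse, the datanode sees both
the usual command list and the NN's HA state in one round trip. A hedged
sketch of what a DN-side caller of sendHeartbeat() can do with it
(processCommand is a hypothetical stand-in for the existing per-command
handling):

    import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
    import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
    import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;

    public class HeartbeatHandlingSketch {
      static void handle(HeartbeatResponse resp) {
        DatanodeCommand[] cmds = resp.getCommands();
        if (cmds != null) {
          for (DatanodeCommand cmd : cmds) {
            processCommand(cmd);
          }
        }
        NNHAStatusHeartbeat haState = resp.getNameNodeHaState();
        if (haState != null
            && haState.getState() == NNHAStatusHeartbeat.State.ACTIVE) {
          // Only the active NN's commands should drive destructive actions
          // such as block deletion; the standby's heartbeat is informational.
        }
      }

      private static void processCommand(DatanodeCommand cmd) {
        // placeholder for the existing per-command handling
      }
    }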
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/GetUserMappingsProtocolClientSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/GetUserMappingsProtocolClientSideTranslatorPB.java
index c29595e..01bd88e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/GetUserMappingsProtocolClientSideTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/GetUserMappingsProtocolClientSideTranslatorPB.java
@@ -20,22 +20,15 @@
import java.io.Closeable;
import java.io.IOException;
-import java.net.InetSocketAddress;
-
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.proto.GetUserMappingsProtocolProtos.GetGroupsForUserRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.GetUserMappingsProtocolProtos.GetGroupsForUserResponseProto;
import org.apache.hadoop.hdfs.protocolR23Compatible.ProtocolSignatureWritable;
-import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.ipc.ProtobufHelper;
-import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.ProtocolMetaInterface;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RpcClientUtil;
import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind;
-import org.apache.hadoop.net.NetUtils;
-import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.tools.GetUserMappingsProtocol;
import com.google.protobuf.RpcController;
@@ -47,16 +40,10 @@
/** RpcController is not used and hence is set to null */
private final static RpcController NULL_CONTROLLER = null;
private final GetUserMappingsProtocolPB rpcProxy;
-
+
public GetUserMappingsProtocolClientSideTranslatorPB(
- InetSocketAddress nameNodeAddr, UserGroupInformation ugi,
- Configuration conf) throws IOException {
- RPC.setProtocolEngine(conf, GetUserMappingsProtocolPB.class,
- ProtobufRpcEngine.class);
- rpcProxy = RPC.getProxy(GetUserMappingsProtocolPB.class,
- RPC.getProtocolVersion(GetUserMappingsProtocolPB.class),
- NameNode.getAddress(conf), ugi, conf,
- NetUtils.getSocketFactory(conf, GetUserMappingsProtocol.class));
+ GetUserMappingsProtocolPB rpcProxy) {
+ this.rpcProxy = rpcProxy;
}
@Override
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/JournalProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/JournalProtocolTranslatorPB.java
index 0735cfd..76ca46f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/JournalProtocolTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/JournalProtocolTranslatorPB.java
@@ -19,17 +19,14 @@
import java.io.Closeable;
import java.io.IOException;
-import java.net.InetSocketAddress;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.proto.JournalProtocolProtos.JournalRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.JournalProtocolProtos.StartLogSegmentRequestProto;
import org.apache.hadoop.hdfs.protocolR23Compatible.ProtocolSignatureWritable;
import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
-import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.ProtobufHelper;
import org.apache.hadoop.ipc.ProtocolMetaInterface;
import org.apache.hadoop.ipc.ProtocolSignature;
@@ -52,12 +49,9 @@
/** RpcController is not used and hence is set to null */
private final static RpcController NULL_CONTROLLER = null;
private final JournalProtocolPB rpcProxy;
-
- public JournalProtocolTranslatorPB(InetSocketAddress nameNodeAddr,
- Configuration conf) throws IOException {
- RPC.setProtocolEngine(conf, JournalProtocolPB.class, ProtobufRpcEngine.class);
- rpcProxy = RPC.getProxy(JournalProtocolPB.class,
- RPC.getProtocolVersion(JournalProtocolPB.class), nameNodeAddr, conf);
+
+ public JournalProtocolTranslatorPB(JournalProtocolPB rpcProxy) {
+ this.rpcProxy = rpcProxy;
}
@Override
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java
index f2ec7ba..7de2c0e4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/NamenodeProtocolTranslatorPB.java
@@ -19,11 +19,9 @@
import java.io.Closeable;
import java.io.IOException;
-import java.net.InetSocketAddress;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.NamenodeCommandProto;
@@ -47,14 +45,11 @@
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.ipc.ProtobufHelper;
-import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.ProtocolMetaInterface;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RpcClientUtil;
import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind;
-import org.apache.hadoop.net.NetUtils;
-import org.apache.hadoop.security.UserGroupInformation;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;
@@ -84,15 +79,6 @@
VersionRequestProto.newBuilder().build();
final private NamenodeProtocolPB rpcProxy;
-
- public NamenodeProtocolTranslatorPB(InetSocketAddress nameNodeAddr,
- Configuration conf, UserGroupInformation ugi) throws IOException {
- RPC.setProtocolEngine(conf, NamenodeProtocolPB.class,
- ProtobufRpcEngine.class);
- rpcProxy = RPC.getProxy(NamenodeProtocolPB.class,
- RPC.getProtocolVersion(NamenodeProtocolPB.class), nameNodeAddr, ugi,
- conf, NetUtils.getSocketFactory(conf, NamenodeProtocolPB.class));
- }
public NamenodeProtocolTranslatorPB(NamenodeProtocolPB rpcProxy) {
this.rpcProxy = rpcProxy;
@@ -137,7 +123,6 @@
}
@Override
- @SuppressWarnings("deprecation")
public CheckpointSignature rollEditLog() throws IOException {
try {
return PBHelper.convert(rpcProxy.rollEditLog(NULL_CONTROLLER,
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java
index fab9f1f..b1e7be0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java
@@ -57,6 +57,7 @@
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.DatanodeStorageProto.StorageState;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.FinalizeCommandProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.KeyUpdateCommandProto;
+import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.NNHAStatusHeartbeatProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ReceivedDeletedBlockInfoProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.RegisterCommandProto;
import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.StorageReportProto;
@@ -119,7 +120,9 @@
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
+import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus;
import org.apache.hadoop.hdfs.server.protocol.RegisterCommand;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
@@ -819,6 +822,23 @@
ReceivedDeletedBlockInfoProto.Builder builder =
ReceivedDeletedBlockInfoProto.newBuilder();
+ ReceivedDeletedBlockInfoProto.BlockStatus status;
+ switch (receivedDeletedBlockInfo.getStatus()) {
+ case RECEIVING_BLOCK:
+ status = ReceivedDeletedBlockInfoProto.BlockStatus.RECEIVING;
+ break;
+ case RECEIVED_BLOCK:
+ status = ReceivedDeletedBlockInfoProto.BlockStatus.RECEIVED;
+ break;
+ case DELETED_BLOCK:
+ status = ReceivedDeletedBlockInfoProto.BlockStatus.DELETED;
+ break;
+ default:
+ throw new IllegalArgumentException("Bad status: " +
+ receivedDeletedBlockInfo.getStatus());
+ }
+ builder.setStatus(status);
+
if (receivedDeletedBlockInfo.getDelHints() != null) {
builder.setDeleteHint(receivedDeletedBlockInfo.getDelHints());
}
@@ -850,7 +870,21 @@
public static ReceivedDeletedBlockInfo convert(
ReceivedDeletedBlockInfoProto proto) {
- return new ReceivedDeletedBlockInfo(PBHelper.convert(proto.getBlock()),
+ ReceivedDeletedBlockInfo.BlockStatus status = null;
+ switch (proto.getStatus()) {
+ case RECEIVING:
+ status = BlockStatus.RECEIVING_BLOCK;
+ break;
+ case RECEIVED:
+ status = BlockStatus.RECEIVED_BLOCK;
+ break;
+ case DELETED:
+ status = BlockStatus.DELETED_BLOCK;
+ break;
+ }
+ return new ReceivedDeletedBlockInfo(
+ PBHelper.convert(proto.getBlock()),
+ status,
proto.hasDeleteHint() ? proto.getDeleteHint() : null);
}
@@ -1245,6 +1279,37 @@
build();
}
+ public static NNHAStatusHeartbeat convert(NNHAStatusHeartbeatProto s) {
+ if (s == null) return null;
+ switch (s.getState()) {
+ case ACTIVE:
+ return new NNHAStatusHeartbeat(NNHAStatusHeartbeat.State.ACTIVE, s.getTxid());
+ case STANDBY:
+ return new NNHAStatusHeartbeat(NNHAStatusHeartbeat.State.STANDBY, s.getTxid());
+ default:
+ throw new IllegalArgumentException("Unexpected NNHAStatusHeartbeat.State:" + s.getState());
+ }
+ }
+
+ public static NNHAStatusHeartbeatProto convert(NNHAStatusHeartbeat hb) {
+ if (hb == null) return null;
+ NNHAStatusHeartbeatProto.Builder builder =
+ NNHAStatusHeartbeatProto.newBuilder();
+ switch (hb.getState()) {
+ case ACTIVE:
+ builder.setState(NNHAStatusHeartbeatProto.State.ACTIVE);
+ break;
+ case STANDBY:
+ builder.setState(NNHAStatusHeartbeatProto.State.STANDBY);
+ break;
+ default:
+ throw new IllegalArgumentException("Unexpected NNHAStatusHeartbeat.State:" +
+ hb.getState());
+ }
+ builder.setTxid(hb.getTxId());
+ return builder.build();
+ }
+
public static DatanodeStorageProto convert(DatanodeStorage s) {
return DatanodeStorageProto.newBuilder()
.setState(PBHelper.convert(s.getState()))
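
The NNHAStatusHeartbeat converters added above, like the ReceivedDeletedBlockInfo status mapping earlier in this file's diff, translate between an internal enum and its generated protobuf counterpart with an explicit switch in each direction and throw on any value the mapping does not know about, so a skew between the two enums fails loudly. A minimal standalone sketch of that round-trip pattern (plain Java, hypothetical enum names, not the Hadoop or protobuf types):

public class EnumRoundTrip {
  // Internal representation used by the server code (illustrative).
  enum HAState { ACTIVE, STANDBY }
  // Wire representation, standing in for the generated protobuf enum.
  enum HAStateProto { ACTIVE, STANDBY }

  static HAStateProto toProto(HAState s) {
    switch (s) {
      case ACTIVE:  return HAStateProto.ACTIVE;
      case STANDBY: return HAStateProto.STANDBY;
      default: throw new IllegalArgumentException("Unexpected state: " + s);
    }
  }

  static HAState fromProto(HAStateProto p) {
    switch (p) {
      case ACTIVE:  return HAState.ACTIVE;
      case STANDBY: return HAState.STANDBY;
      default: throw new IllegalArgumentException("Unexpected state: " + p);
    }
  }

  public static void main(String[] args) {
    // Round-trip every value to make sure the two mappings stay in sync.
    for (HAState s : HAState.values()) {
      if (fromProto(toProto(s)) != s) {
        throw new AssertionError("mapping out of sync for " + s);
      }
    }
    System.out.println("round trip ok");
  }
}
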
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshAuthorizationPolicyProtocolClientSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshAuthorizationPolicyProtocolClientSideTranslatorPB.java
index 0fcf424..96ba2cf 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshAuthorizationPolicyProtocolClientSideTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshAuthorizationPolicyProtocolClientSideTranslatorPB.java
@@ -20,21 +20,15 @@
import java.io.Closeable;
import java.io.IOException;
-import java.net.InetSocketAddress;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.proto.RefreshAuthorizationPolicyProtocolProtos.RefreshServiceAclRequestProto;
import org.apache.hadoop.hdfs.protocolR23Compatible.ProtocolSignatureWritable;
-import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.ipc.ProtobufHelper;
-import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.ProtocolMetaInterface;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RpcClientUtil;
import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind;
-import org.apache.hadoop.net.NetUtils;
-import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
import com.google.protobuf.RpcController;
@@ -46,16 +40,10 @@
/** RpcController is not used and hence is set to null */
private final static RpcController NULL_CONTROLLER = null;
private final RefreshAuthorizationPolicyProtocolPB rpcProxy;
-
+
public RefreshAuthorizationPolicyProtocolClientSideTranslatorPB(
- InetSocketAddress nameNodeAddr, UserGroupInformation ugi,
- Configuration conf) throws IOException {
- RPC.setProtocolEngine(conf, RefreshAuthorizationPolicyProtocolPB.class,
- ProtobufRpcEngine.class);
- rpcProxy = RPC.getProxy(RefreshAuthorizationPolicyProtocolPB.class,
- RPC.getProtocolVersion(RefreshAuthorizationPolicyProtocolPB.class),
- NameNode.getAddress(conf), ugi, conf,
- NetUtils.getSocketFactory(conf, RefreshAuthorizationPolicyProtocol.class));
+ RefreshAuthorizationPolicyProtocolPB rpcProxy) {
+ this.rpcProxy = rpcProxy;
}
@Override
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshUserMappingsProtocolClientSideTranslatorPB.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshUserMappingsProtocolClientSideTranslatorPB.java
index eb8e059..6f07617 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshUserMappingsProtocolClientSideTranslatorPB.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/RefreshUserMappingsProtocolClientSideTranslatorPB.java
@@ -20,23 +20,17 @@
import java.io.Closeable;
import java.io.IOException;
-import java.net.InetSocketAddress;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.proto.RefreshUserMappingsProtocolProtos.RefreshSuperUserGroupsConfigurationRequestProto;
import org.apache.hadoop.hdfs.protocol.proto.RefreshUserMappingsProtocolProtos.RefreshUserToGroupsMappingsRequestProto;
import org.apache.hadoop.hdfs.protocolR23Compatible.ProtocolSignatureWritable;
-import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.ipc.ProtobufHelper;
-import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.ProtocolMetaInterface;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RpcClientUtil;
import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind;
-import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.RefreshUserMappingsProtocol;
-import org.apache.hadoop.security.UserGroupInformation;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;
@@ -47,16 +41,10 @@
/** RpcController is not used and hence is set to null */
private final static RpcController NULL_CONTROLLER = null;
private final RefreshUserMappingsProtocolPB rpcProxy;
-
+
public RefreshUserMappingsProtocolClientSideTranslatorPB(
- InetSocketAddress nameNodeAddr, UserGroupInformation ugi,
- Configuration conf) throws IOException {
- RPC.setProtocolEngine(conf, RefreshUserMappingsProtocolPB.class,
- ProtobufRpcEngine.class);
- rpcProxy = RPC.getProxy(RefreshUserMappingsProtocolPB.class,
- RPC.getProtocolVersion(RefreshUserMappingsProtocolPB.class),
- NameNode.getAddress(conf), ugi, conf,
- NetUtils.getSocketFactory(conf, RefreshUserMappingsProtocol.class));
+ RefreshUserMappingsProtocolPB rpcProxy) {
+ this.rpcProxy = rpcProxy;
}
@Override
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java
index b5f24d1..ba62a2c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java
@@ -21,6 +21,7 @@
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
+import java.io.InterruptedIOException;
import java.net.InetSocketAddress;
import java.util.Iterator;
@@ -283,7 +284,18 @@
@Override //AbstractDelegationTokenManager
protected void logUpdateMasterKey(DelegationKey key)
throws IOException {
- namesystem.logUpdateMasterKey(key);
+ synchronized (noInterruptsLock) {
+ // The edit logging code will fail catastrophically if it
+ // is interrupted during a logSync, since the interrupt
+ // closes the edit log files. Doing this inside the
+ // above lock and then checking interruption status
+ // prevents this bug.
+ if (Thread.interrupted()) {
+ throw new InterruptedIOException(
+ "Interrupted before updating master key");
+ }
+ namesystem.logUpdateMasterKey(key);
+ }
}
/** A utility method for creating credentials. */
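
The change above guards the edit-log write by re-checking the thread's interrupt status only after taking the lock, so an interrupt delivered earlier cannot land in the middle of a logSync and close the edit log underneath it. A standalone sketch of that check-inside-the-lock pattern, with a stand-in logger in place of the namesystem:

import java.io.IOException;
import java.io.InterruptedIOException;

public class GuardedLogger {
  // By convention, threads are never interrupted while holding this lock.
  private final Object noInterruptsLock = new Object();

  /** Stand-in for an edit-log write that must not be interrupted mid-way. */
  private void logUpdate(String key) throws IOException {
    System.out.println("logged key " + key);
  }

  public void logUpdateMasterKey(String key) throws IOException {
    synchronized (noInterruptsLock) {
      // Clear and check the interrupt flag *inside* the lock: if we were
      // interrupted before getting here, bail out cleanly instead of letting
      // the interrupt hit the log write and close the underlying stream.
      if (Thread.interrupted()) {
        throw new InterruptedIOException("Interrupted before updating master key");
      }
      logUpdate(key);
    }
  }

  public static void main(String[] args) throws IOException {
    new GuardedLogger().logUpdateMasterKey("k1");
  }
}
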
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSelector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSelector.java
index 1822b27..4f73b85 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSelector.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSelector.java
@@ -59,6 +59,11 @@
new InetSocketAddress(nnAddr.getHostName(), nnRpcPort));
return INSTANCE.selectToken(serviceName, ugi.getTokens());
}
+
+ public static Token<DelegationTokenIdentifier> selectHdfsDelegationToken(
+ Text serviceName, UserGroupInformation ugi) {
+ return INSTANCE.selectToken(serviceName, ugi.getTokens());
+ }
public DelegationTokenSelector() {
super(DelegationTokenIdentifier.HDFS_DELEGATION_KIND);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
index a0146e7..e808af6 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java
@@ -24,8 +24,8 @@
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
-import java.net.InetSocketAddress;
import java.net.Socket;
+import java.net.URI;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Arrays;
@@ -1379,7 +1379,7 @@
* for each namenode,
* execute a {@link Balancer} to work through all datanodes once.
*/
- static int run(List<InetSocketAddress> namenodes, final Parameters p,
+ static int run(Collection<URI> namenodes, final Parameters p,
Configuration conf) throws IOException, InterruptedException {
final long sleeptime = 2000*conf.getLong(
DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
@@ -1393,8 +1393,8 @@
final List<NameNodeConnector> connectors
= new ArrayList<NameNodeConnector>(namenodes.size());
try {
- for(InetSocketAddress isa : namenodes) {
- connectors.add(new NameNodeConnector(isa, conf));
+ for (URI uri : namenodes) {
+ connectors.add(new NameNodeConnector(uri, conf));
}
boolean done = false;
@@ -1476,7 +1476,7 @@
try {
checkReplicationPolicyCompatibility(conf);
- final List<InetSocketAddress> namenodes = DFSUtil.getNNServiceRpcAddresses(conf);
+ final Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
return Balancer.run(namenodes, parse(args), conf);
} catch (IOException e) {
System.out.println(e + ". Exiting ...");
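
With this change the balancer is driven by a collection of nameservice URIs rather than a single NameNode socket address, and it opens one NameNodeConnector per URI. A rough standalone sketch of that per-nameservice loop, using a hypothetical comma-separated configuration value in place of DFSUtil.getNsServiceRpcUris(conf):

import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;

public class PerNamenodeRun {
  /** Stand-in for DFSUtil.getNsServiceRpcUris(conf): one URI per nameservice. */
  static Collection<URI> parseNameserviceUris(String csv) {
    Collection<URI> uris = new ArrayList<URI>();
    for (String s : csv.split(",")) {
      uris.add(URI.create(s.trim()));
    }
    return uris;
  }

  public static void main(String[] args) {
    // In a federated or HA cluster there may be several logical nameservices;
    // the balancer works through each of them in turn.
    Collection<URI> namenodes = parseNameserviceUris("hdfs://ns1,hdfs://ns2");
    for (URI uri : namenodes) {
      System.out.println("would create a connector for " + uri);
    }
  }
}
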
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java
index 83822e4..c4208b7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java
@@ -21,38 +21,25 @@
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
-import java.net.InetSocketAddress;
+import java.net.URI;
import java.util.EnumSet;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolPB;
-import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
-import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.io.IOUtils;
-import org.apache.hadoop.io.retry.RetryPolicies;
-import org.apache.hadoop.io.retry.RetryPolicy;
-import org.apache.hadoop.io.retry.RetryProxy;
-import org.apache.hadoop.ipc.ProtobufRpcEngine;
-import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
-import org.apache.hadoop.net.NetUtils;
-import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Daemon;
@@ -64,7 +51,7 @@
private static final Log LOG = Balancer.LOG;
private static final Path BALANCER_ID_PATH = new Path("/system/balancer.id");
- final InetSocketAddress namenodeAddress;
+ final URI nameNodeUri;
final String blockpoolID;
final NamenodeProtocol namenode;
@@ -78,12 +65,17 @@
private BlockTokenSecretManager blockTokenSecretManager;
private Daemon keyupdaterthread; // AccessKeyUpdater thread
- NameNodeConnector(InetSocketAddress namenodeAddress, Configuration conf
- ) throws IOException {
- this.namenodeAddress = namenodeAddress;
- this.namenode = createNamenode(namenodeAddress, conf);
- this.client = DFSUtil.createNamenode(conf);
- this.fs = FileSystem.get(NameNode.getUri(namenodeAddress), conf);
+ NameNodeConnector(URI nameNodeUri,
+ Configuration conf) throws IOException {
+ this.nameNodeUri = nameNodeUri;
+
+ this.namenode =
+ NameNodeProxies.createProxy(conf, nameNodeUri, NamenodeProtocol.class)
+ .getProxy();
+ this.client =
+ NameNodeProxies.createProxy(conf, nameNodeUri, ClientProtocol.class)
+ .getProxy();
+ this.fs = FileSystem.get(nameNodeUri, conf);
final NamespaceInfo namespaceinfo = namenode.versionRequest();
this.blockpoolID = namespaceinfo.getBlockPoolID();
@@ -188,38 +180,11 @@
@Override
public String toString() {
- return getClass().getSimpleName() + "[namenodeAddress=" + namenodeAddress
+ return getClass().getSimpleName() + "[namenodeUri=" + nameNodeUri
+ ", id=" + blockpoolID
+ "]";
}
- /** Build a NamenodeProtocol connection to the namenode and
- * set up the retry policy
- */
- private static NamenodeProtocol createNamenode(InetSocketAddress address,
- Configuration conf) throws IOException {
- RetryPolicy timeoutPolicy = RetryPolicies.exponentialBackoffRetry(
- 5, 200, TimeUnit.MILLISECONDS);
- Map<Class<? extends Exception>,RetryPolicy> exceptionToPolicyMap =
- new HashMap<Class<? extends Exception>, RetryPolicy>();
- RetryPolicy methodPolicy = RetryPolicies.retryByException(
- timeoutPolicy, exceptionToPolicyMap);
- Map<String,RetryPolicy> methodNameToPolicyMap =
- new HashMap<String, RetryPolicy>();
- methodNameToPolicyMap.put("getBlocks", methodPolicy);
- methodNameToPolicyMap.put("getAccessKeys", methodPolicy);
-
- RPC.setProtocolEngine(conf, NamenodeProtocolPB.class,
- ProtobufRpcEngine.class);
- NamenodeProtocolPB proxy = RPC.getProxy(NamenodeProtocolPB.class,
- RPC.getProtocolVersion(NamenodeProtocolPB.class), address,
- UserGroupInformation.getCurrentUser(), conf,
- NetUtils.getDefaultSocketFactory(conf));
- NamenodeProtocolPB retryProxy = (NamenodeProtocolPB) RetryProxy.create(
- NamenodeProtocolPB.class, proxy, methodNameToPolicyMap);
- return new NamenodeProtocolTranslatorPB(retryProxy);
- }
-
/**
* Periodically updates access keys.
*/
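
The hand-rolled RPC proxy construction removed above is replaced by NameNodeProxies.createProxy(conf, uri, protocolClass), which centralizes retry and failover handling behind one factory keyed by protocol class and URI. As a loose illustration only (JDK dynamic proxies, not the Hadoop RPC or RetryPolicy classes), a generic wrapper that retries a call once on IOException looks like this:

import java.io.IOException;
import java.lang.reflect.InvocationHandler;
import java.lang.reflect.Method;
import java.lang.reflect.Proxy;

public class RetryingProxyFactory {
  /** Wrap any interface so each call is retried once on IOException. */
  @SuppressWarnings("unchecked")
  static <T> T createProxy(Class<T> iface, final T target) {
    InvocationHandler handler = new InvocationHandler() {
      @Override
      public Object invoke(Object proxy, Method method, Object[] args) throws Throwable {
        try {
          return method.invoke(target, args);
        } catch (java.lang.reflect.InvocationTargetException e) {
          if (e.getCause() instanceof IOException) {
            // One simple retry; a real failover proxy would consult a policy.
            return method.invoke(target, args);
          }
          throw e.getCause();
        }
      }
    };
    return (T) Proxy.newProxyInstance(
        iface.getClassLoader(), new Class<?>[] { iface }, handler);
  }

  interface Echo { String echo(String s) throws IOException; }

  public static void main(String[] args) throws IOException {
    Echo direct = new Echo() {
      public String echo(String s) { return s; }
    };
    Echo proxied = createProxy(Echo.class, direct);
    System.out.println(proxied.echo("hello"));
  }
}
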
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java
index 58725a6..ce3ff8b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java
@@ -183,7 +183,7 @@
/**
* Count the number of data-nodes the block belongs to.
*/
- int numNodes() {
+ public int numNodes() {
assert this.triplets != null : "BlockInfo is not initialized";
assert triplets.length % 3 == 0 : "Malformed BlockInfo";
for(int idx = getCapacity()-1; idx >= 0; idx--) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index c8f36a0..1c9b2aa 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -28,6 +28,8 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -48,6 +50,7 @@
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
+import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.common.Util;
@@ -68,6 +71,7 @@
import org.apache.hadoop.util.Daemon;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Sets;
/**
* Keeps information related to the blocks stored in the Hadoop cluster.
@@ -80,17 +84,27 @@
/** Default load factor of map */
public static final float DEFAULT_MAP_LOAD_FACTOR = 0.75f;
+ private static final String QUEUE_REASON_CORRUPT_STATE =
+ "it has the wrong state or generation stamp";
+
+ private static final String QUEUE_REASON_FUTURE_GENSTAMP =
+ "generation stamp is in the future";
+
private final Namesystem namesystem;
private final DatanodeManager datanodeManager;
private final HeartbeatManager heartbeatManager;
private final BlockTokenSecretManager blockTokenSecretManager;
+
+ private final PendingDataNodeMessages pendingDNMessages =
+ new PendingDataNodeMessages();
private volatile long pendingReplicationBlocksCount = 0L;
private volatile long corruptReplicaBlocksCount = 0L;
private volatile long underReplicatedBlocksCount = 0L;
private volatile long scheduledReplicationBlocksCount = 0L;
private volatile long excessBlocksCount = 0L;
+ private volatile long postponedMisreplicatedBlocksCount = 0L;
/** Used by metrics */
public long getPendingReplicationBlocksCount() {
@@ -116,6 +130,14 @@
public long getExcessBlocksCount() {
return excessBlocksCount;
}
+ /** Used by metrics */
+ public long getPostponedMisreplicatedBlocksCount() {
+ return postponedMisreplicatedBlocksCount;
+ }
+ /** Used by metrics */
+ public int getPendingDataNodeMessageCount() {
+ return pendingDNMessages.count();
+ }
/**replicationRecheckInterval is how often namenode checks for new replication work*/
private final long replicationRecheckInterval;
@@ -134,6 +156,15 @@
/** Blocks to be invalidated. */
private final InvalidateBlocks invalidateBlocks;
+
+ /**
+ * After a failover, over-replicated blocks may not be handled
+ * until all of the replicas have done a block report to the
+ * new active. This is to make sure that this NameNode has been
+ * notified of all block deletions that might have been pending
+ * when the failover happened.
+ */
+ private final Set<Block> postponedMisreplicatedBlocks = Sets.newHashSet();
//
// Keeps a TreeSet for every named node. Each treeset contains
@@ -316,49 +347,15 @@
out.println("Metasave: Blocks waiting for replication: " +
neededReplications.size());
for (Block block : neededReplications) {
- List<DatanodeDescriptor> containingNodes =
- new ArrayList<DatanodeDescriptor>();
- List<DatanodeDescriptor> containingLiveReplicasNodes =
- new ArrayList<DatanodeDescriptor>();
-
- NumberReplicas numReplicas = new NumberReplicas();
- // source node returned is not used
- chooseSourceDatanode(block, containingNodes,
- containingLiveReplicasNodes, numReplicas);
- assert containingLiveReplicasNodes.size() == numReplicas.liveReplicas();
- int usableReplicas = numReplicas.liveReplicas() +
- numReplicas.decommissionedReplicas();
-
- if (block instanceof BlockInfo) {
- String fileName = ((BlockInfo)block).getINode().getFullPathName();
- out.print(fileName + ": ");
- }
- // l: == live:, d: == decommissioned c: == corrupt e: == excess
- out.print(block + ((usableReplicas > 0)? "" : " MISSING") +
- " (replicas:" +
- " l: " + numReplicas.liveReplicas() +
- " d: " + numReplicas.decommissionedReplicas() +
- " c: " + numReplicas.corruptReplicas() +
- " e: " + numReplicas.excessReplicas() + ") ");
-
- Collection<DatanodeDescriptor> corruptNodes =
- corruptReplicas.getNodes(block);
-
- for (Iterator<DatanodeDescriptor> jt = blocksMap.nodeIterator(block);
- jt.hasNext();) {
- DatanodeDescriptor node = jt.next();
- String state = "";
- if (corruptNodes != null && corruptNodes.contains(node)) {
- state = "(corrupt)";
- } else if (node.isDecommissioned() ||
- node.isDecommissionInProgress()) {
- state = "(decommissioned)";
- }
- out.print(" " + node + state + " : ");
- }
- out.println("");
+ dumpBlockMeta(block, out);
}
}
+
+ // Dump any postponed over-replicated blocks
+ out.println("Mis-replicated blocks that have been postponed:");
+ for (Block block : postponedMisreplicatedBlocks) {
+ dumpBlockMeta(block, out);
+ }
// Dump blocks from pendingReplication
pendingReplications.metaSave(out);
@@ -369,6 +366,58 @@
// Dump all datanodes
getDatanodeManager().datanodeDump(out);
}
+
+ /**
+ * Dump the metadata for the given block in a human-readable
+ * form.
+ */
+ private void dumpBlockMeta(Block block, PrintWriter out) {
+ List<DatanodeDescriptor> containingNodes =
+ new ArrayList<DatanodeDescriptor>();
+ List<DatanodeDescriptor> containingLiveReplicasNodes =
+ new ArrayList<DatanodeDescriptor>();
+
+ NumberReplicas numReplicas = new NumberReplicas();
+ // source node returned is not used
+ chooseSourceDatanode(block, containingNodes,
+ containingLiveReplicasNodes, numReplicas);
+ assert containingLiveReplicasNodes.size() == numReplicas.liveReplicas();
+ int usableReplicas = numReplicas.liveReplicas() +
+ numReplicas.decommissionedReplicas();
+
+ if (block instanceof BlockInfo) {
+ String fileName = ((BlockInfo)block).getINode().getFullPathName();
+ out.print(fileName + ": ");
+ }
+ // l: == live:, d: == decommissioned c: == corrupt e: == excess
+ out.print(block + ((usableReplicas > 0)? "" : " MISSING") +
+ " (replicas:" +
+ " l: " + numReplicas.liveReplicas() +
+ " d: " + numReplicas.decommissionedReplicas() +
+ " c: " + numReplicas.corruptReplicas() +
+ " e: " + numReplicas.excessReplicas() + ") ");
+
+ Collection<DatanodeDescriptor> corruptNodes =
+ corruptReplicas.getNodes(block);
+
+ for (Iterator<DatanodeDescriptor> jt = blocksMap.nodeIterator(block);
+ jt.hasNext();) {
+ DatanodeDescriptor node = jt.next();
+ String state = "";
+ if (corruptNodes != null && corruptNodes.contains(node)) {
+ state = "(corrupt)";
+ } else if (node.isDecommissioned() ||
+ node.isDecommissionInProgress()) {
+ state = "(decommissioned)";
+ }
+
+ if (node.areBlockContentsStale()) {
+        state += " (block deletions may be out of date)";
+ }
+ out.print(" " + node + state + " : ");
+ }
+ out.println("");
+ }
/** @return maxReplicationStreams */
public int getMaxReplicationStreams() {
@@ -425,7 +474,7 @@
final boolean b = commitBlock((BlockInfoUnderConstruction)lastBlock, commitBlock);
if(countNodes(lastBlock).liveReplicas() >= minReplication)
- completeBlock(fileINode,fileINode.numBlocks()-1);
+ completeBlock(fileINode,fileINode.numBlocks()-1, false);
return b;
}
@@ -437,19 +486,15 @@
* of replicas reported from data-nodes.
*/
private BlockInfo completeBlock(final INodeFile fileINode,
- final int blkIndex) throws IOException {
- return completeBlock(fileINode, blkIndex, false);
- }
-
- public BlockInfo completeBlock(final INodeFile fileINode,
- final int blkIndex, final boolean force) throws IOException {
+ final int blkIndex, boolean force) throws IOException {
if(blkIndex < 0)
return null;
BlockInfo curBlock = fileINode.getBlocks()[blkIndex];
if(curBlock.isComplete())
return curBlock;
BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)curBlock;
- if(!force && ucBlock.numNodes() < minReplication)
+ int numNodes = ucBlock.numNodes();
+ if (!force && numNodes < minReplication)
throw new IOException("Cannot complete block: " +
"block does not satisfy minimal replication requirement.");
if(!force && ucBlock.getBlockUCState() != BlockUCState.COMMITTED)
@@ -458,20 +503,43 @@
BlockInfo completeBlock = ucBlock.convertToCompleteBlock();
// replace penultimate block in file
fileINode.setBlock(blkIndex, completeBlock);
+
+ // Since safe-mode only counts complete blocks, and we now have
+ // one more complete block, we need to adjust the total up, and
+ // also count it as safe, if we have at least the minimum replica
+ // count. (We may not have the minimum replica count yet if this is
+ // a "forced" completion when a file is getting closed by an
+ // OP_CLOSE edit on the standby).
+ namesystem.adjustSafeModeBlockTotals(0, 1);
+ namesystem.incrementSafeBlockCount(
+ Math.min(numNodes, minReplication));
+
// replace block in the blocksMap
return blocksMap.replaceBlock(completeBlock);
}
private BlockInfo completeBlock(final INodeFile fileINode,
- final BlockInfo block) throws IOException {
+ final BlockInfo block, boolean force) throws IOException {
BlockInfo[] fileBlocks = fileINode.getBlocks();
for(int idx = 0; idx < fileBlocks.length; idx++)
if(fileBlocks[idx] == block) {
- return completeBlock(fileINode, idx);
+ return completeBlock(fileINode, idx, force);
}
return block;
}
+
+ /**
+ * Force the given block in the given file to be marked as complete,
+ * regardless of whether enough replicas are present. This is necessary
+ * when tailing edit logs as a Standby.
+ */
+ public BlockInfo forceCompleteBlock(final INodeFile fileINode,
+ final BlockInfoUnderConstruction block) throws IOException {
+ block.commitBlock(block);
+ return completeBlock(fileINode, block, true);
+ }
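
Safe mode counts only COMPLETE blocks, so the accounting in completeBlock above moves in two directions: completing a block bumps the expected total (and the safe count once minimal replication is reached), while converting a file's last block back to under-construction, a little further below, subtracts from both. A small standalone sketch of that bookkeeping, with an assumed minReplication of 1:

public class SafeModeCounts {
  private final int minReplication = 1;   // assumed minimal replication
  private long blockTotal;                // complete blocks expected in safe mode
  private long blocksSafe;                // complete blocks with enough replicas

  /** Mirror of adjustSafeModeBlockTotals: deltas may be negative. */
  synchronized void adjustTotals(long deltaSafe, long deltaTotal) {
    blocksSafe += deltaSafe;
    blockTotal += deltaTotal;
  }

  /** A block just became COMPLETE with numNodes known replicas. */
  synchronized void onBlockCompleted(int numNodes) {
    adjustTotals(numNodes >= minReplication ? 1 : 0, 1);
  }

  /** The last block of a file was converted back to under-construction. */
  synchronized void onBlockReopened(int numTargets) {
    adjustTotals(numTargets >= minReplication ? -1 : 0, -1);
  }

  public static void main(String[] args) {
    SafeModeCounts c = new SafeModeCounts();
    c.onBlockCompleted(3);   // total=1, safe=1
    c.onBlockReopened(3);    // total=0, safe=0
    System.out.println("total=" + c.blockTotal + " safe=" + c.blocksSafe);
  }
}
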
+
/**
* Convert the last block of the file to an under construction block.<p>
* The block is converted only if the file has blocks and the last one
@@ -508,6 +576,14 @@
String datanodeId = dd.getStorageID();
invalidateBlocks.remove(datanodeId, oldBlock);
}
+
+ // Adjust safe-mode totals, since under-construction blocks don't
+ // count in safe-mode.
+ namesystem.adjustSafeModeBlockTotals(
+ // decrement safe if we had enough
+ targets.length >= minReplication ? -1 : 0,
+ // always decrement total blocks
+ -1);
final long fileLength = fileINode.computeContentSummary().getLength();
final long pos = fileLength - ucBlock.getNumBytes();
@@ -598,8 +674,8 @@
final boolean isCorrupt = numCorruptNodes == numNodes;
final int numMachines = isCorrupt ? numNodes: numNodes - numCorruptNodes;
final DatanodeDescriptor[] machines = new DatanodeDescriptor[numMachines];
+ int j = 0;
if (numMachines > 0) {
- int j = 0;
for(Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(blk);
it.hasNext();) {
final DatanodeDescriptor d = it.next();
@@ -608,6 +684,12 @@
machines[j++] = d;
}
}
+ assert j == machines.length :
+ "isCorrupt: " + isCorrupt +
+ " numMachines: " + numMachines +
+ " numNodes: " + numNodes +
+ " numCorrupt: " + numCorruptNodes +
+ " numCorruptRepls: " + numCorruptReplicas;
final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), blk);
return new LocatedBlock(eb, machines, pos, isCorrupt);
}
@@ -772,6 +854,14 @@
node.resetBlocks();
invalidateBlocks.remove(node.getStorageID());
+
+ // If the DN hasn't block-reported since the most recent
+ // failover, then we may have been holding up on processing
+ // over-replicated blocks because of it. But we can now
+ // process those blocks.
+ if (node.areBlockContentsStale()) {
+ rescanPostponedMisreplicatedBlocks();
+ }
}
/**
@@ -809,22 +899,18 @@
*/
public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk,
final DatanodeInfo dn, String reason) throws IOException {
- namesystem.writeLock();
- try {
- final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock());
- if (storedBlock == null) {
- // Check if the replica is in the blockMap, if not
- // ignore the request for now. This could happen when BlockScanner
- // thread of Datanode reports bad block before Block reports are sent
- // by the Datanode on startup
- NameNode.stateChangeLog.info("BLOCK* findAndMarkBlockAsCorrupt: "
- + blk + " not found.");
- return;
- }
- markBlockAsCorrupt(storedBlock, dn, reason);
- } finally {
- namesystem.writeUnlock();
+ assert namesystem.hasWriteLock();
+ final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock());
+ if (storedBlock == null) {
+ // Check if the replica is in the blockMap, if not
+ // ignore the request for now. This could happen when BlockScanner
+ // thread of Datanode reports bad block before Block reports are sent
+ // by the Datanode on startup
+ NameNode.stateChangeLog.info("BLOCK* findAndMarkBlockAsCorrupt: "
+ + blk + " not found.");
+ return;
}
+ markBlockAsCorrupt(storedBlock, dn, reason);
}
private void markBlockAsCorrupt(BlockInfo storedBlock,
@@ -876,10 +962,17 @@
+ " because datanode " + dn.getName() + " does not exist.");
}
- // Check how many copies we have of the block. If we have at least one
- // copy on a live node, then we can delete it.
- int count = countNodes(blk).liveReplicas();
- if (count >= 1) {
+ // Check how many copies we have of the block
+ NumberReplicas nr = countNodes(blk);
+ if (nr.replicasOnStaleNodes() > 0) {
+ NameNode.stateChangeLog.info("BLOCK* invalidateBlocks: postponing " +
+ "invalidation of block " + blk + " on " + dn + " because " +
+ nr.replicasOnStaleNodes() + " replica(s) are located on nodes " +
+ "with potentially out-of-date block reports.");
+ postponeBlock(blk);
+
+ } else if (nr.liveReplicas() >= 1) {
+ // If we have at least one copy on a live node, then we can delete it.
addToInvalidates(blk, dn);
removeStoredBlock(blk, node);
if(NameNode.stateChangeLog.isDebugEnabled()) {
@@ -892,6 +985,13 @@
}
}
+ private void postponeBlock(Block blk) {
+ if (postponedMisreplicatedBlocks.add(blk)) {
+ postponedMisreplicatedBlocksCount++;
+ }
+ }
+
+
void updateState() {
pendingReplicationBlocksCount = pendingReplications.size();
underReplicatedBlocksCount = neededReplications.size();
@@ -930,7 +1030,7 @@
*
* @return number of blocks scheduled for replication during this iteration.
*/
- private int computeReplicationWork(int blocksToProcess) throws IOException {
+ int computeReplicationWork(int blocksToProcess) throws IOException {
List<List<Block>> blocksToReplicate = null;
namesystem.writeLock();
try {
@@ -981,8 +1081,10 @@
NumberReplicas numReplicas = new NumberReplicas();
srcNode = chooseSourceDatanode(
block, containingNodes, liveReplicaNodes, numReplicas);
- if(srcNode == null) // block can not be replicated from any node
+ if(srcNode == null) { // block can not be replicated from any node
+        LOG.debug("Block " + block + " cannot be replicated from any node");
continue;
+ }
assert liveReplicaNodes.size() == numReplicas.liveReplicas();
// do not schedule more if enough replicas is already pending
@@ -1232,7 +1334,7 @@
srcNode = node;
}
if(numReplicas != null)
- numReplicas.initialize(live, decommissioned, corrupt, excess);
+ numReplicas.initialize(live, decommissioned, corrupt, excess, 0);
return srcNode;
}
@@ -1314,7 +1416,7 @@
// To minimize startup time, we discard any second (or later) block reports
// that we receive while still in startup phase.
- if (namesystem.isInStartupSafeMode() && node.numBlocks() > 0) {
+ if (namesystem.isInStartupSafeMode() && !node.isFirstBlockReport()) {
NameNode.stateChangeLog.info("BLOCK* processReport: "
+ "discarded non-initial block report from " + nodeID.getName()
+ " because namenode still in startup phase");
@@ -1328,6 +1430,19 @@
} else {
processReport(node, newReport);
}
+
+ // Now that we have an up-to-date block report, we know that any
+ // deletions from a previous NN iteration have been accounted for.
+ boolean staleBefore = node.areBlockContentsStale();
+ node.receivedBlockReport();
+ if (staleBefore && !node.areBlockContentsStale()) {
+ LOG.info("BLOCK* processReport: " +
+ "Received first block report from " + node +
+ " after becoming active. Its block contents are no longer" +
+ " considered stale.");
+ rescanPostponedMisreplicatedBlocks();
+ }
+
} finally {
endTime = Util.now();
namesystem.writeUnlock();
@@ -1340,6 +1455,37 @@
+ ", processing time: " + (endTime - startTime) + " msecs");
}
+ /**
+ * Rescan the list of blocks which were previously postponed.
+ */
+ private void rescanPostponedMisreplicatedBlocks() {
+ for (Iterator<Block> it = postponedMisreplicatedBlocks.iterator();
+ it.hasNext();) {
+ Block b = it.next();
+
+ BlockInfo bi = blocksMap.getStoredBlock(b);
+ if (bi == null) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
+ "Postponed mis-replicated block " + b + " no longer found " +
+ "in block map.");
+ }
+ it.remove();
+ postponedMisreplicatedBlocksCount--;
+ continue;
+ }
+ MisReplicationResult res = processMisReplicatedBlock(bi);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
+ "Re-scanned block " + b + ", result is " + res);
+ }
+ if (res != MisReplicationResult.POSTPONE) {
+ it.remove();
+ postponedMisreplicatedBlocksCount--;
+ }
+ }
+ }
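
rescanPostponedMisreplicatedBlocks walks the postponed set with an explicit iterator so entries can be dropped in place once a decision other than POSTPONE is reached. A compact standalone sketch of that postpone-and-rescan idiom (strings stand in for blocks):

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class PostponedWork {
  enum Result { POSTPONE, OK }

  private final Set<String> postponed = new HashSet<String>();
  private long postponedCount = 0;

  void postpone(String block) {
    if (postponed.add(block)) {
      postponedCount++;
    }
  }

  /** Stand-in for re-checking a block once more information is available. */
  Result process(String block) {
    return block.startsWith("stale") ? Result.POSTPONE : Result.OK;
  }

  /** Retry each postponed entry, dropping the ones that are now resolved. */
  void rescan() {
    for (Iterator<String> it = postponed.iterator(); it.hasNext();) {
      String b = it.next();
      if (process(b) != Result.POSTPONE) {
        it.remove();          // remove via the iterator to avoid CME
        postponedCount--;
      }
    }
  }

  public static void main(String[] args) {
    PostponedWork w = new PostponedWork();
    w.postpone("blk_1");
    w.postpone("stale_blk_2");
    w.rescan();
    System.out.println("still postponed: " + w.postponed);
  }
}
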
+
private void processReport(final DatanodeDescriptor node,
final BlockListAsLongs report) throws IOException {
// Normal case:
@@ -1392,9 +1538,19 @@
assert (node.numBlocks() == 0);
BlockReportIterator itBR = report.getBlockReportIterator();
+ boolean isStandby = namesystem.isInStandbyState();
+
while(itBR.hasNext()) {
Block iblk = itBR.next();
ReplicaState reportedState = itBR.getCurrentReplicaState();
+
+ if (isStandby &&
+ namesystem.isGenStampInFuture(iblk.getGenerationStamp())) {
+ queueReportedBlock(node, iblk, reportedState,
+ QUEUE_REASON_FUTURE_GENSTAMP);
+ continue;
+ }
+
BlockInfo storedBlock = blocksMap.getStoredBlock(iblk);
// If block does not belong to any file, we are done.
if (storedBlock == null) continue;
@@ -1404,7 +1560,14 @@
BlockToMarkCorrupt c = checkReplicaCorrupt(
iblk, reportedState, storedBlock, ucState, node);
if (c != null) {
- markBlockAsCorrupt(c.blockInfo, node, c.reason);
+ if (namesystem.isInStandbyState()) {
+          // In the Standby, we may receive a block report for a file for which
+          // we only have an out-of-date gen-stamp or state, for example.
+ queueReportedBlock(node, iblk, reportedState,
+ QUEUE_REASON_CORRUPT_STATE);
+ } else {
+ markBlockAsCorrupt(c.blockInfo, node, c.reason);
+ }
continue;
}
@@ -1487,7 +1650,8 @@
* @param toCorrupt replicas with unexpected length or generation stamp;
* add to corrupt replicas
* @param toUC replicas of blocks currently under construction
- * @return
+ * @return the up-to-date stored block, if it should be kept.
+ * Otherwise, null.
*/
private BlockInfo processReportedBlock(final DatanodeDescriptor dn,
final Block block, final ReplicaState reportedState,
@@ -1502,6 +1666,13 @@
+ " replicaState = " + reportedState);
}
+ if (namesystem.isInStandbyState() &&
+ namesystem.isGenStampInFuture(block.getGenerationStamp())) {
+ queueReportedBlock(dn, block, reportedState,
+ QUEUE_REASON_FUTURE_GENSTAMP);
+ return null;
+ }
+
// find block by blockId
BlockInfo storedBlock = blocksMap.getStoredBlock(block);
if(storedBlock == null) {
@@ -1519,15 +1690,24 @@
// Ignore replicas already scheduled to be removed from the DN
if(invalidateBlocks.contains(dn.getStorageID(), block)) {
- assert storedBlock.findDatanode(dn) < 0 : "Block " + block
- + " in invalidated blocks set should not appear in DN " + dn;
+/* TODO: following assertion is incorrect, see HDFS-2668
+assert storedBlock.findDatanode(dn) < 0 : "Block " + block
+ + " in recentInvalidatesSet should not appear in DN " + dn; */
return storedBlock;
}
BlockToMarkCorrupt c = checkReplicaCorrupt(
block, reportedState, storedBlock, ucState, dn);
if (c != null) {
- toCorrupt.add(c);
+ if (namesystem.isInStandbyState()) {
+ // If the block is an out-of-date generation stamp or state,
+ // but we're the standby, we shouldn't treat it as corrupt,
+ // but instead just queue it for later processing.
+ queueReportedBlock(dn, storedBlock, reportedState,
+ QUEUE_REASON_CORRUPT_STATE);
+ } else {
+ toCorrupt.add(c);
+ }
return storedBlock;
}
@@ -1545,6 +1725,68 @@
return storedBlock;
}
+ /**
+ * Queue the given reported block for later processing in the
+ * standby node. {@see PendingDataNodeMessages}.
+ * @param reason a textual reason to report in the debug logs
+ */
+ private void queueReportedBlock(DatanodeDescriptor dn, Block block,
+ ReplicaState reportedState, String reason) {
+ assert namesystem.isInStandbyState();
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Queueing reported block " + block +
+ " in state " + reportedState +
+ " from datanode " + dn + " for later processing " +
+ "because " + reason + ".");
+ }
+ pendingDNMessages.enqueueReportedBlock(dn, block, reportedState);
+ }
+
+ /**
+ * Try to process any messages that were previously queued for the given
+ * block. This is called from FSEditLogLoader whenever a block's state
+ * in the namespace has changed or a new block has been created.
+ */
+ public void processQueuedMessagesForBlock(Block b) throws IOException {
+ Queue<ReportedBlockInfo> queue = pendingDNMessages.takeBlockQueue(b);
+ if (queue == null) {
+ // Nothing to re-process
+ return;
+ }
+ processQueuedMessages(queue);
+ }
+
+ private void processQueuedMessages(Iterable<ReportedBlockInfo> rbis)
+ throws IOException {
+ for (ReportedBlockInfo rbi : rbis) {
+ if (LOG.isDebugEnabled()) {
+        LOG.debug("Processing previously queued message " + rbi);
+ }
+ processAndHandleReportedBlock(
+ rbi.getNode(), rbi.getBlock(), rbi.getReportedState(), null);
+ }
+ }
+
+ /**
+ * Process any remaining queued datanode messages after entering
+ * active state. At this point they will not be re-queued since
+ * we are the definitive master node and thus should be up-to-date
+ * with the namespace information.
+ */
+ public void processAllPendingDNMessages() throws IOException {
+ assert !namesystem.isInStandbyState() :
+ "processAllPendingDNMessages() should be called after exiting " +
+ "standby state!";
+ int count = pendingDNMessages.count();
+ if (count > 0) {
+ LOG.info("Processing " + count + " messages from DataNodes " +
+ "that were previously queued during standby state.");
+ }
+ processQueuedMessages(pendingDNMessages.takeAll());
+ assert pendingDNMessages.count() == 0;
+ }
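
These two methods drain the per-block message queue that the standby fills while it cannot yet apply a report: one block's queue is replayed when edit-log tailing changes that block's state, and everything is replayed when the node becomes active. A minimal standalone sketch of the queue shape that PendingDataNodeMessages (added later in this patch) provides, using plain strings for blocks and messages:

import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.Map;
import java.util.Queue;

public class PendingMessages {
  private final Map<String, Queue<String>> byBlock =
      new HashMap<String, Queue<String>>();
  private int count = 0;

  /** Queue a report that cannot be applied yet (e.g. gen-stamp in the future). */
  void enqueue(String block, String message) {
    Queue<String> q = byBlock.get(block);
    if (q == null) {
      q = new ArrayDeque<String>();
      byBlock.put(block, q);
    }
    q.add(message);
    count++;
  }

  /** Take and clear the queue for one block, e.g. after its state changed. */
  Queue<String> takeBlockQueue(String block) {
    Queue<String> q = byBlock.remove(block);
    if (q != null) {
      count -= q.size();
    }
    return q;
  }

  /** Drain everything, e.g. when this node becomes the active NameNode. */
  Queue<String> takeAll() {
    Queue<String> all = new ArrayDeque<String>();
    for (Queue<String> q : byBlock.values()) {
      all.addAll(q);
    }
    byBlock.clear();
    count = 0;
    return all;
  }

  public static void main(String[] args) {
    PendingMessages pm = new PendingMessages();
    pm.enqueue("blk_1", "RECEIVED from dn1");
    pm.enqueue("blk_1", "DELETED from dn2");
    System.out.println(pm.takeBlockQueue("blk_1"));
    System.out.println("remaining: " + pm.count);
  }
}
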
+
/*
* The next two methods test the various cases under which we must conclude
* the replica is corrupt, or under construction. These are laid out
@@ -1675,13 +1917,15 @@
// Now check for completion of blocks and safe block count
int numCurrentReplica = countLiveNodes(storedBlock);
if (storedBlock.getBlockUCState() == BlockUCState.COMMITTED
- && numCurrentReplica >= minReplication)
- storedBlock = completeBlock(storedBlock.getINode(), storedBlock);
-
- // check whether safe replication is reached for the block
- // only complete blocks are counted towards that
- if(storedBlock.isComplete())
+ && numCurrentReplica >= minReplication) {
+ completeBlock(storedBlock.getINode(), storedBlock, false);
+ } else if (storedBlock.isComplete()) {
+ // check whether safe replication is reached for the block
+ // only complete blocks are counted towards that.
+ // In the case that the block just became complete above, completeBlock()
+ // handles the safe block count maintenance.
namesystem.incrementSafeBlockCount(numCurrentReplica);
+ }
}
/**
@@ -1738,15 +1982,17 @@
+ pendingReplications.getNumReplicas(storedBlock);
if(storedBlock.getBlockUCState() == BlockUCState.COMMITTED &&
- numLiveReplicas >= minReplication)
- storedBlock = completeBlock(fileINode, storedBlock);
-
- // check whether safe replication is reached for the block
- // only complete blocks are counted towards that
- // Is no-op if not in safe mode.
- if(storedBlock.isComplete())
+ numLiveReplicas >= minReplication) {
+ storedBlock = completeBlock(fileINode, storedBlock, false);
+ } else if (storedBlock.isComplete()) {
+ // check whether safe replication is reached for the block
+ // only complete blocks are counted towards that
+ // Is no-op if not in safe mode.
+ // In the case that the block just became complete above, completeBlock()
+ // handles the safe block count maintenance.
namesystem.incrementSafeBlockCount(numCurrentReplica);
-
+ }
+
// if file is under construction, then done for now
if (fileINode.isUnderConstruction()) {
return storedBlock;
@@ -1839,49 +2085,93 @@
public void processMisReplicatedBlocks() {
assert namesystem.hasWriteLock();
- long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0,
+ long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0, nrPostponed = 0,
nrUnderConstruction = 0;
neededReplications.clear();
for (BlockInfo block : blocksMap.getBlocks()) {
- INodeFile fileINode = block.getINode();
- if (fileINode == null) {
- // block does not belong to any file
- nrInvalid++;
- addToInvalidates(block);
- continue;
+ MisReplicationResult res = processMisReplicatedBlock(block);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("block " + block + ": " + res);
}
- if (!block.isComplete()) {
- // Incomplete blocks are never considered mis-replicated --
- // they'll be reached when they are completed or recovered.
- nrUnderConstruction++;
- continue;
- }
- // calculate current replication
- short expectedReplication = fileINode.getReplication();
- NumberReplicas num = countNodes(block);
- int numCurrentReplica = num.liveReplicas();
- // add to under-replicated queue if need to be
- if (isNeededReplication(block, expectedReplication, numCurrentReplica)) {
- if (neededReplications.add(block, numCurrentReplica, num
- .decommissionedReplicas(), expectedReplication)) {
- nrUnderReplicated++;
- }
- }
-
- if (numCurrentReplica > expectedReplication) {
- // over-replicated block
+ switch (res) {
+ case UNDER_REPLICATED:
+ nrUnderReplicated++;
+ break;
+ case OVER_REPLICATED:
nrOverReplicated++;
- processOverReplicatedBlock(block, expectedReplication, null, null);
+ break;
+ case INVALID:
+ nrInvalid++;
+ break;
+ case POSTPONE:
+ nrPostponed++;
+ postponeBlock(block);
+ break;
+ case UNDER_CONSTRUCTION:
+ nrUnderConstruction++;
+ break;
+ case OK:
+ break;
+ default:
+ throw new AssertionError("Invalid enum value: " + res);
}
}
-
+
LOG.info("Total number of blocks = " + blocksMap.size());
LOG.info("Number of invalid blocks = " + nrInvalid);
LOG.info("Number of under-replicated blocks = " + nrUnderReplicated);
- LOG.info("Number of over-replicated blocks = " + nrOverReplicated);
+ LOG.info("Number of over-replicated blocks = " + nrOverReplicated +
+ ((nrPostponed > 0) ? ( " (" + nrPostponed + " postponed)") : ""));
LOG.info("Number of blocks being written = " + nrUnderConstruction);
}
+ /**
+ * Process a single possibly misreplicated block. This adds it to the
+ * appropriate queues if necessary, and returns a result code indicating
+ * what happened with it.
+ */
+ private MisReplicationResult processMisReplicatedBlock(BlockInfo block) {
+ INodeFile fileINode = block.getINode();
+ if (fileINode == null) {
+ // block does not belong to any file
+ addToInvalidates(block);
+ return MisReplicationResult.INVALID;
+ }
+ if (!block.isComplete()) {
+ // Incomplete blocks are never considered mis-replicated --
+ // they'll be reached when they are completed or recovered.
+ return MisReplicationResult.UNDER_CONSTRUCTION;
+ }
+ // calculate current replication
+ short expectedReplication = fileINode.getReplication();
+ NumberReplicas num = countNodes(block);
+ int numCurrentReplica = num.liveReplicas();
+ // add to under-replicated queue if need to be
+ if (isNeededReplication(block, expectedReplication, numCurrentReplica)) {
+ if (neededReplications.add(block, numCurrentReplica, num
+ .decommissionedReplicas(), expectedReplication)) {
+ return MisReplicationResult.UNDER_REPLICATED;
+ }
+ }
+
+ if (numCurrentReplica > expectedReplication) {
+ if (num.replicasOnStaleNodes() > 0) {
+ // If any of the replicas of this block are on nodes that are
+ // considered "stale", then these replicas may in fact have
+ // already been deleted. So, we cannot safely act on the
+ // over-replication until a later point in time, when
+ // the "stale" nodes have block reported.
+ return MisReplicationResult.POSTPONE;
+ }
+
+ // over-replicated block
+ processOverReplicatedBlock(block, expectedReplication, null, null);
+ return MisReplicationResult.OVER_REPLICATED;
+ }
+
+ return MisReplicationResult.OK;
+ }
+
/** Set replication for the blocks. */
public void setReplication(final short oldRepl, final short newRepl,
final String src, final Block... blocks) throws IOException {
@@ -1925,6 +2215,14 @@
for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block);
it.hasNext();) {
DatanodeDescriptor cur = it.next();
+ if (cur.areBlockContentsStale()) {
+ LOG.info("BLOCK* processOverReplicatedBlock: " +
+ "Postponing processing of over-replicated block " +
+ block + " since datanode " + cur + " does not yet have up-to-date " +
+ "block information.");
+ postponeBlock(block);
+ return;
+ }
LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(cur
.getStorageID());
if (excessBlocks == null || !excessBlocks.contains(block)) {
@@ -2151,13 +2449,19 @@
// Modify the blocks->datanode map and node's map.
//
pendingReplications.remove(block);
-
+ processAndHandleReportedBlock(node, block, ReplicaState.FINALIZED,
+ delHintNode);
+ }
+
+ private void processAndHandleReportedBlock(DatanodeDescriptor node, Block block,
+ ReplicaState reportedState, DatanodeDescriptor delHintNode)
+ throws IOException {
// blockReceived reports a finalized block
Collection<BlockInfo> toAdd = new LinkedList<BlockInfo>();
Collection<Block> toInvalidate = new LinkedList<Block>();
Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>();
Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>();
- processReportedBlock(node, block, ReplicaState.FINALIZED,
+ processReportedBlock(node, block, reportedState,
toAdd, toInvalidate, toCorrupt, toUC);
// the block is only in one of the to-do lists
// if it is in none then data-node already has it
@@ -2181,59 +2485,80 @@
}
}
- /** The given node is reporting that it received/deleted certain blocks. */
- public void blockReceivedAndDeleted(final DatanodeID nodeID,
+ /**
+ * The given node is reporting incremental information about some blocks.
+ * This includes blocks that are starting to be received, completed being
+ * received, or deleted.
+ */
+ public void processIncrementalBlockReport(final DatanodeID nodeID,
final String poolId,
- final ReceivedDeletedBlockInfo receivedAndDeletedBlocks[]
+ final ReceivedDeletedBlockInfo blockInfos[]
) throws IOException {
namesystem.writeLock();
int received = 0;
int deleted = 0;
+ int receiving = 0;
try {
final DatanodeDescriptor node = datanodeManager.getDatanode(nodeID);
if (node == null || !node.isAlive) {
NameNode.stateChangeLog
- .warn("BLOCK* blockReceivedDeleted"
+ .warn("BLOCK* processIncrementalBlockReport"
+ " is received from dead or unregistered node "
+ nodeID.getName());
throw new IOException(
- "Got blockReceivedDeleted message from unregistered or dead node");
+ "Got incremental block report from unregistered or dead node");
}
- for (int i = 0; i < receivedAndDeletedBlocks.length; i++) {
- if (receivedAndDeletedBlocks[i].isDeletedBlock()) {
- removeStoredBlock(
- receivedAndDeletedBlocks[i].getBlock(), node);
+ for (ReceivedDeletedBlockInfo rdbi : blockInfos) {
+ switch (rdbi.getStatus()) {
+ case DELETED_BLOCK:
+ removeStoredBlock(rdbi.getBlock(), node);
deleted++;
- } else {
- addBlock(node, receivedAndDeletedBlocks[i].getBlock(),
- receivedAndDeletedBlocks[i].getDelHints());
+ break;
+ case RECEIVED_BLOCK:
+ addBlock(node, rdbi.getBlock(), rdbi.getDelHints());
received++;
+ break;
+ case RECEIVING_BLOCK:
+ receiving++;
+ processAndHandleReportedBlock(node, rdbi.getBlock(),
+ ReplicaState.RBW, null);
+ break;
+ default:
+ String msg =
+ "Unknown block status code reported by " + nodeID.getName() +
+ ": " + rdbi;
+ NameNode.stateChangeLog.warn(msg);
+ assert false : msg; // if assertions are enabled, throw.
+ break;
}
if (NameNode.stateChangeLog.isDebugEnabled()) {
- NameNode.stateChangeLog.debug("BLOCK* block"
- + (receivedAndDeletedBlocks[i].isDeletedBlock() ? "Deleted"
- : "Received") + ": " + receivedAndDeletedBlocks[i].getBlock()
+ NameNode.stateChangeLog.debug("BLOCK* block "
+ + (rdbi.getStatus()) + ": " + rdbi.getBlock()
+ " is received from " + nodeID.getName());
}
}
} finally {
namesystem.writeUnlock();
NameNode.stateChangeLog
- .debug("*BLOCK* NameNode.blockReceivedAndDeleted: " + "from "
- + nodeID.getName() + " received: " + received + ", "
+ .debug("*BLOCK* NameNode.processIncrementalBlockReport: " + "from "
+ + nodeID.getName()
+ + " receiving: " + receiving + ", "
+ + " received: " + received + ", "
+ " deleted: " + deleted);
}
}
/**
- * Return the number of nodes that are live and decommissioned.
+ * Return the number of nodes hosting a given block, grouped
+ * by the state of those replicas.
*/
public NumberReplicas countNodes(Block b) {
- int count = 0;
+ int decommissioned = 0;
int live = 0;
int corrupt = 0;
int excess = 0;
+ int stale = 0;
Iterator<DatanodeDescriptor> nodeIter = blocksMap.nodeIterator(b);
Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b);
while (nodeIter.hasNext()) {
@@ -2241,7 +2566,7 @@
if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) {
corrupt++;
} else if (node.isDecommissionInProgress() || node.isDecommissioned()) {
- count++;
+ decommissioned++;
} else {
LightWeightLinkedSet<Block> blocksExcess = excessReplicateMap.get(node
.getStorageID());
@@ -2251,8 +2576,11 @@
live++;
}
}
+ if (node.areBlockContentsStale()) {
+ stale++;
+ }
}
- return new NumberReplicas(live, count, corrupt, excess);
+ return new NumberReplicas(live, decommissioned, corrupt, excess, stale);
}
/**
@@ -2379,7 +2707,7 @@
}
public int getActiveBlockCount() {
- return blocksMap.size() - (int)invalidateBlocks.numBlocks();
+ return blocksMap.size();
}
public DatanodeDescriptor[] getNodes(BlockInfo block) {
@@ -2397,10 +2725,17 @@
}
public void removeBlock(Block block) {
+ assert namesystem.hasWriteLock();
+ // No need to ACK blocks that are being removed entirely
+ // from the namespace, since the removal of the associated
+ // file already removes them from the block map below.
block.setNumBytes(BlockCommand.NO_ACK);
addToInvalidates(block);
corruptReplicas.removeFromCorruptReplicasMap(block);
blocksMap.removeBlock(block);
+ if (postponedMisreplicatedBlocks.remove(block)) {
+ postponedMisreplicatedBlocksCount--;
+ }
}
public BlockInfo getStoredBlock(Block block) {
@@ -2412,6 +2747,9 @@
final int curReplicasDelta, int expectedReplicasDelta) {
namesystem.writeLock();
try {
+ if (!namesystem.isPopulatingReplQueues()) {
+ return;
+ }
NumberReplicas repl = countNodes(block);
int curExpectedReplicas = getReplication(block);
if (isNeededReplication(block, curExpectedReplicas, repl.liveReplicas())) {
@@ -2461,8 +2799,10 @@
namesystem.writeLock();
try {
// blocks should not be replicated or removed if safe mode is on
- if (namesystem.isInSafeMode())
+ if (namesystem.isInSafeMode()) {
+ LOG.debug("In safemode, not computing replication work");
return 0;
+ }
// get blocks to invalidate for the nodeId
assert nodeId != null;
return invalidateBlocks.invalidateWork(nodeId);
@@ -2645,6 +2985,19 @@
return workFound;
}
+ /**
+ * Clear all queues that hold decisions previously made by
+ * this NameNode.
+ */
+ public void clearQueues() {
+ neededReplications.clear();
+ pendingReplications.clear();
+ excessReplicateMap.clear();
+ invalidateBlocks.clear();
+ datanodeManager.clearPendingQueues();
+  }
+
+
private static class ReplicationWork {
private Block block;
@@ -2675,4 +3028,24 @@
this.targets = null;
}
}
+
+ /**
+ * A simple result enum for the result of
+ * {@link BlockManager#processMisReplicatedBlock(BlockInfo)}.
+ */
+ enum MisReplicationResult {
+ /** The block should be invalidated since it belongs to a deleted file. */
+ INVALID,
+ /** The block is currently under-replicated. */
+ UNDER_REPLICATED,
+ /** The block is currently over-replicated. */
+ OVER_REPLICATED,
+ /** A decision can't currently be made about this block. */
+ POSTPONE,
+ /** The block is under construction, so should be ignored */
+ UNDER_CONSTRUCTION,
+ /** The block is properly replicated */
+ OK
+ }
+
}
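
Taken together, processMisReplicatedBlocks now reduces to classifying each block into one MisReplicationResult and counting the outcomes, with POSTPONE reserved for blocks that still have replicas on stale nodes. An illustrative standalone sketch of that classification loop (simplified inputs, not the BlockManager types):

public class MisReplicationScan {
  enum MisReplicationResult { INVALID, UNDER_REPLICATED, OVER_REPLICATED,
                              POSTPONE, UNDER_CONSTRUCTION, OK }

  /** Stand-in classification: decide what, if anything, to do with one block. */
  static MisReplicationResult classify(int live, int expected, int staleReplicas,
                                       boolean complete, boolean hasOwner) {
    if (!hasOwner)        return MisReplicationResult.INVALID;
    if (!complete)        return MisReplicationResult.UNDER_CONSTRUCTION;
    if (live < expected)  return MisReplicationResult.UNDER_REPLICATED;
    if (live > expected) {
      // Cannot trust the replica count while some replicas sit on stale nodes.
      return staleReplicas > 0 ? MisReplicationResult.POSTPONE
                               : MisReplicationResult.OVER_REPLICATED;
    }
    return MisReplicationResult.OK;
  }

  public static void main(String[] args) {
    long under = 0, over = 0, postponed = 0;
    int[][] blocks = { {2, 3, 0}, {4, 3, 1}, {4, 3, 0}, {3, 3, 0} }; // live, expected, stale
    for (int[] b : blocks) {
      switch (classify(b[0], b[1], b[2], true, true)) {
        case UNDER_REPLICATED: under++;     break;
        case OVER_REPLICATED:  over++;      break;
        case POSTPONE:         postponed++; break;
        default: break;
      }
    }
    System.out.println("under=" + under + " over=" + over
        + " (" + postponed + " postponed)");
  }
}
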
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
index ac1a7e6..058d2e3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockPlacementPolicyDefault.java
@@ -63,7 +63,7 @@
initialize(conf, stats, clusterMap);
}
- BlockPlacementPolicyDefault() {
+ protected BlockPlacementPolicyDefault() {
}
@Override
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
index d927f05..984456f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeDescriptor.java
@@ -94,6 +94,10 @@
boolean contains(E e) {
return blockq.contains(e);
}
+
+ synchronized void clear() {
+ blockq.clear();
+ }
}
private volatile BlockInfo blockList = null;
@@ -103,6 +107,24 @@
public boolean isAlive = false;
public boolean needKeyUpdate = false;
+ /**
+ * Set to false on any NN failover, and reset to true
+ * whenever a block report is received.
+ */
+ private boolean heartbeatedSinceFailover = false;
+
+ /**
+ * At startup or at any failover, the DNs in the cluster may
+ * have pending block deletions from a previous incarnation
+ * of the NameNode. Thus, we consider their block contents
+ * stale until we have received a block report. When a DN
+ * is considered stale, any replicas on it are transitively
+ * considered stale. If any block has at least one stale replica,
+ * then no invalidations will be processed for this block.
+ * See HDFS-1972.
+ */
+ private boolean blockContentsStale = true;
+
// A system administrator can tune the balancer bandwidth parameter
// (dfs.balance.bandwidthPerSec) dynamically by calling
// "dfsadmin -setBalanacerBandwidth <newbandwidth>", at which point the
@@ -129,6 +151,10 @@
private long lastBlocksScheduledRollTime = 0;
private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
private int volumeFailures = 0;
+
+ /** Set to false after processing first block report */
+ private boolean firstBlockReport = true;
+
/**
* When set to true, the node is not in include list and is not allowed
* to communicate with the namenode
@@ -281,6 +307,14 @@
this.invalidateBlocks.clear();
this.volumeFailures = 0;
}
+
+ public void clearBlockQueues() {
+ synchronized (invalidateBlocks) {
+ this.invalidateBlocks.clear();
+ this.recoverBlocks.clear();
+ this.replicateBlocks.clear();
+ }
+ }
public int numBlocks() {
return numBlocks;
@@ -298,6 +332,7 @@
this.lastUpdate = System.currentTimeMillis();
this.xceiverCount = xceiverCount;
this.volumeFailures = volFailures;
+ this.heartbeatedSinceFailover = true;
rollBlocksScheduled(lastUpdate);
}
@@ -564,5 +599,41 @@
this.bandwidth = bandwidth;
}
+ public boolean areBlockContentsStale() {
+ return blockContentsStale;
+ }
+ public void markStaleAfterFailover() {
+ heartbeatedSinceFailover = false;
+ blockContentsStale = true;
+ }
+
+ public void receivedBlockReport() {
+ if (heartbeatedSinceFailover) {
+ blockContentsStale = false;
+ }
+ firstBlockReport = false;
+ }
+
+ boolean isFirstBlockReport() {
+ return firstBlockReport;
+ }
+
+ @Override
+ public String dumpDatanode() {
+ StringBuilder sb = new StringBuilder(super.dumpDatanode());
+ int repl = replicateBlocks.size();
+ if (repl > 0) {
+ sb.append(" ").append(repl).append(" blocks to be replicated;");
+ }
+ int inval = invalidateBlocks.size();
+ if (inval > 0) {
+ sb.append(" ").append(inval).append(" blocks to be invalidated;");
+ }
+ int recover = recoverBlocks.size();
+ if (recover > 0) {
+ sb.append(" ").append(recover).append(" blocks to be recovered;");
+ }
+ return sb.toString();
+ }
}
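
The staleness handling above rests on two flags: a failover marks every DataNode's block contents stale, a later heartbeat records that the node has spoken to this NameNode, and only a block report received after such a heartbeat clears the stale flag. A standalone sketch of that small state machine:

public class StaleTracker {
  // Set false on failover; set true again on the first heartbeat afterwards.
  private boolean heartbeatedSinceFailover = false;
  // Replica contents are untrusted until a post-failover block report arrives.
  private boolean blockContentsStale = true;

  void markStaleAfterFailover() {
    heartbeatedSinceFailover = false;
    blockContentsStale = true;
  }

  void onHeartbeat() {
    heartbeatedSinceFailover = true;
  }

  void onBlockReport() {
    // Only a report sent *after* the node has heartbeated to this NameNode
    // is guaranteed to reflect deletions issued by the previous active.
    if (heartbeatedSinceFailover) {
      blockContentsStale = false;
    }
  }

  boolean areBlockContentsStale() {
    return blockContentsStale;
  }

  public static void main(String[] args) {
    StaleTracker dn = new StaleTracker();
    dn.markStaleAfterFailover();
    dn.onBlockReport();                              // report raced ahead of any heartbeat
    System.out.println(dn.areBlockContentsStale());  // still true
    dn.onHeartbeat();
    dn.onBlockReport();
    System.out.println(dn.areBlockContentsStale());  // now false
  }
}
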
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
index 5d795e7..8c59ccb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
@@ -923,7 +923,7 @@
}
}
- return null;
+ return new DatanodeCommand[0];
}
/**
@@ -947,4 +947,27 @@
}
}
}
+
+ public void markAllDatanodesStale() {
+    LOG.info("Marking all datanodes as stale");
+ synchronized (datanodeMap) {
+ for (DatanodeDescriptor dn : datanodeMap.values()) {
+ dn.markStaleAfterFailover();
+ }
+ }
+ }
+
+ /**
+ * Clear any actions that are queued up to be sent to the DNs
+ * on their next heartbeats. This includes block invalidations,
+ * recoveries, and replication requests.
+ */
+ public void clearPendingQueues() {
+ synchronized (datanodeMap) {
+ for (DatanodeDescriptor dn : datanodeMap.values()) {
+ dn.clearBlockQueues();
+ }
+ }
+ }
+
}
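
Taken together, markAllDatanodesStale() and clearPendingQueues() describe what a NameNode has to do to datanode state when it becomes active. A hedged sketch of that call sequence (the actual call site lives in the namesystem/BlockManager failover path and is not part of this hunk):

    // Hedged sketch -- illustrative call site, not part of the patch.
    void onTransitionToActive(DatanodeManager dm) {
      dm.markAllDatanodesStale();  // distrust replica contents until fresh block reports arrive
      dm.clearPendingQueues();     // drop invalidations/recoveries/replications queued by the previous active NN
    }
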
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/InvalidateBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/InvalidateBlocks.java
index 2c6b46f..5c7e0bd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/InvalidateBlocks.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/InvalidateBlocks.java
@@ -160,4 +160,9 @@
numBlocks -= toInvalidate.size();
return toInvalidate;
}
+
+ synchronized void clear() {
+ node2blocks.clear();
+ numBlocks = 0;
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/NumberReplicas.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/NumberReplicas.java
index 52f6258..9e5c8df 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/NumberReplicas.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/NumberReplicas.java
@@ -26,20 +26,22 @@
private int decommissionedReplicas;
private int corruptReplicas;
private int excessReplicas;
+ private int replicasOnStaleNodes;
NumberReplicas() {
- initialize(0, 0, 0, 0);
+ initialize(0, 0, 0, 0, 0);
}
- NumberReplicas(int live, int decommissioned, int corrupt, int excess) {
- initialize(live, decommissioned, corrupt, excess);
+ NumberReplicas(int live, int decommissioned, int corrupt, int excess, int stale) {
+ initialize(live, decommissioned, corrupt, excess, stale);
}
- void initialize(int live, int decommissioned, int corrupt, int excess) {
+ void initialize(int live, int decommissioned, int corrupt, int excess, int stale) {
liveReplicas = live;
decommissionedReplicas = decommissioned;
corruptReplicas = corrupt;
excessReplicas = excess;
+ replicasOnStaleNodes = stale;
}
public int liveReplicas() {
@@ -54,4 +56,13 @@
public int excessReplicas() {
return excessReplicas;
}
+
+ /**
+ * @return the number of replicas which are on stale nodes.
+ * This is not mutually exclusive with the other counts -- i.e. a
+ * replica may count as both "live" and "stale".
+ */
+ public int replicasOnStaleNodes() {
+ return replicasOnStaleNodes;
+ }
}
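
Because replicasOnStaleNodes() overlaps the other counts, callers must not add it to liveReplicas(). A hedged illustration using the package-private constructor added above:

    // Hedged illustration -- a replica on a live-but-stale DN is counted in both figures.
    NumberReplicas counts = new NumberReplicas(3 /*live*/, 0, 0, 0, 1 /*stale*/);
    assert counts.liveReplicas() == 3;
    assert counts.replicasOnStaleNodes() == 1; // one of the three live replicas is also stale
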
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingDataNodeMessages.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingDataNodeMessages.java
new file mode 100644
index 0000000..b7da116
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingDataNodeMessages.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.blockmanagement;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * In the Standby Node, we can receive messages about blocks
+ * before they are actually available in the namespace, or while
+ * they have an outdated state in the namespace. In those cases,
+ * we queue those block-related messages in this structure.
+ */
+class PendingDataNodeMessages {
+
+ Map<Block, Queue<ReportedBlockInfo>> queueByBlockId =
+ Maps.newHashMap();
+ private int count = 0;
+
+
+ static class ReportedBlockInfo {
+ private final Block block;
+ private final DatanodeDescriptor dn;
+ private final ReplicaState reportedState;
+
+ ReportedBlockInfo(DatanodeDescriptor dn, Block block,
+ ReplicaState reportedState) {
+ this.dn = dn;
+ this.block = block;
+ this.reportedState = reportedState;
+ }
+
+ Block getBlock() {
+ return block;
+ }
+
+ DatanodeDescriptor getNode() {
+ return dn;
+ }
+
+ ReplicaState getReportedState() {
+ return reportedState;
+ }
+
+ @Override
+ public String toString() {
+ return "ReportedBlockInfo [block=" + block + ", dn=" + dn
+ + ", reportedState=" + reportedState + "]";
+ }
+ }
+
+ void enqueueReportedBlock(DatanodeDescriptor dn, Block block,
+ ReplicaState reportedState) {
+ block = new Block(block);
+ getBlockQueue(block).add(
+ new ReportedBlockInfo(dn, block, reportedState));
+ count++;
+ }
+
+ /**
+ * @return any messages that were previously queued for the given block,
+ * or null if no messages were queued.
+ */
+ Queue<ReportedBlockInfo> takeBlockQueue(Block block) {
+ Queue<ReportedBlockInfo> queue = queueByBlockId.remove(block);
+ if (queue != null) {
+ count -= queue.size();
+ }
+ return queue;
+ }
+
+
+ private Queue<ReportedBlockInfo> getBlockQueue(Block block) {
+ Queue<ReportedBlockInfo> queue = queueByBlockId.get(block);
+ if (queue == null) {
+ queue = Lists.newLinkedList();
+ queueByBlockId.put(block, queue);
+ }
+ return queue;
+ }
+
+ public int count() {
+ return count;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ for (Map.Entry<Block, Queue<ReportedBlockInfo>> entry :
+ queueByBlockId.entrySet()) {
+ sb.append("Block " + entry.getKey() + ":\n");
+ for (ReportedBlockInfo rbi : entry.getValue()) {
+ sb.append(" ").append(rbi).append("\n");
+ }
+ }
+ return sb.toString();
+ }
+
+ public Iterable<ReportedBlockInfo> takeAll() {
+ List<ReportedBlockInfo> rbis = Lists.newArrayListWithCapacity(
+ count);
+ for (Queue<ReportedBlockInfo> q : queueByBlockId.values()) {
+ rbis.addAll(q);
+ }
+ queueByBlockId.clear();
+ count = 0;
+ return rbis;
+ }
+}
\ No newline at end of file
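
The queue above is drained in two ways: per-block via takeBlockQueue() when the standby later processes that block, and wholesale via takeAll(). A hedged usage sketch (the dn and block variables, and the replay step, stand in for BlockManager wiring that is not part of this file):

    // Hedged usage sketch -- surrounding BlockManager wiring assumed.
    PendingDataNodeMessages pending = new PendingDataNodeMessages();

    // A DN reports a replica for a block the standby does not yet know about:
    pending.enqueueReportedBlock(dn, block, ReplicaState.FINALIZED);

    // Once the standby's namespace has caught up, replay the queued messages:
    Queue<ReportedBlockInfo> queued = pending.takeBlockQueue(block);
    if (queued != null) {
      for (ReportedBlockInfo rbi : queued) {
        // re-run the normal reported-block processing for rbi.getNode(),
        // rbi.getBlock(), rbi.getReportedState() ...
      }
    }
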
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java
index e07cf9b..e200ed0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java
@@ -104,6 +104,14 @@
}
}
+
+ public void clear() {
+ synchronized (pendingReplications) {
+ pendingReplications.clear();
+ timedOutItems.clear();
+ }
+ }
+
/**
* The total number of blocks that are undergoing replication
*/
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
index 3de9067..c76d24c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Storage.java
@@ -568,7 +568,7 @@
* <p> Locking is not supported by all file systems.
* E.g., NFS does not consistently support exclusive locks.
*
- * <p> If locking is supported we guarantee exculsive access to the
+ * <p> If locking is supported we guarantee exclusive access to the
* storage directory. Otherwise, no guarantee is given.
*
* @throws IOException if locking fails
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Util.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Util.java
index 642551e..1f4e974 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Util.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/common/Util.java
@@ -23,6 +23,7 @@
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -97,9 +98,9 @@
* @param names collection of strings to convert to URIs
* @return collection of URIs
*/
- public static Collection<URI> stringCollectionAsURIs(
+ public static List<URI> stringCollectionAsURIs(
Collection<String> names) {
- Collection<URI> uris = new ArrayList<URI>(names.size());
+ List<URI> uris = new ArrayList<URI>(names.size());
for(String name : names) {
try {
uris.add(stringAsURI(name));
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java
index 5b1ed7c..27567b5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPOfferService.java
@@ -17,20 +17,16 @@
*/
package org.apache.hadoop.hdfs.server.datanode;
-import static org.apache.hadoop.hdfs.server.common.Util.now;
-
import java.io.IOException;
import java.net.InetSocketAddress;
-import java.net.SocketTimeoutException;
-import java.net.URI;
-import java.util.Collection;
-import java.util.LinkedList;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.CopyOnWriteArrayList;
import org.apache.commons.logging.Log;
import org.apache.hadoop.classification.InterfaceAudience;
-import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.Block;
-import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
@@ -50,8 +46,11 @@
import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
import org.apache.hadoop.hdfs.server.protocol.FinalizeCommand;
import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand;
+import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
+import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus;
+import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand;
import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
@@ -61,23 +60,22 @@
import org.apache.hadoop.util.StringUtils;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
/**
- * A thread per namenode to perform:
- * <ul>
- * <li> Pre-registration handshake with namenode</li>
- * <li> Registration with namenode</li>
- * <li> Send periodic heartbeats to the namenode</li>
- * <li> Handle commands received from the namenode</li>
- * </ul>
+ * One instance per block-pool/namespace on the DN, which handles the
+ * heartbeats to the active and standby NNs for that namespace.
+ * This class manages an instance of {@link BPServiceActor} for each NN,
+ * and delegates calls to both NNs.
+ * It also maintains the state about which of the NNs is considered active.
*/
@InterfaceAudience.Private
-class BPOfferService implements Runnable {
+class BPOfferService {
static final Log LOG = DataNode.LOG;
- final InetSocketAddress nnAddr;
-
/**
* Information about the namespace that this service
* is registering with. This is assigned after
@@ -92,43 +90,80 @@
*/
DatanodeRegistration bpRegistration;
- long lastBlockReport = 0;
- long lastDeletedReport = 0;
-
- boolean resetBlockReportTime = true;
-
- Thread bpThread;
- DatanodeProtocolClientSideTranslatorPB bpNamenode;
- private long lastHeartbeat = 0;
- private volatile boolean initialized = false;
- private final LinkedList<ReceivedDeletedBlockInfo> receivedAndDeletedBlockList
- = new LinkedList<ReceivedDeletedBlockInfo>();
- private volatile int pendingReceivedRequests = 0;
- private volatile boolean shouldServiceRun = true;
UpgradeManagerDatanode upgradeManager = null;
private final DataNode dn;
- private final DNConf dnConf;
- BPOfferService(InetSocketAddress nnAddr, DataNode dn) {
+ /**
+ * A reference to the BPServiceActor associated with the currently
+ * ACTIVE NN. In the case that all NameNodes are in STANDBY mode,
+ * this can be null. If non-null, this must always refer to a member
+ * of the {@link #bpServices} list.
+ */
+ private BPServiceActor bpServiceToActive = null;
+
+ /**
+ * The list of all actors for namenodes in this nameservice, regardless
+ * of their active or standby states.
+ */
+ private List<BPServiceActor> bpServices =
+ new CopyOnWriteArrayList<BPServiceActor>();
+
+ /**
+ * Each time we receive a heartbeat from a NN claiming to be ACTIVE,
+ * we record that NN's most recent transaction ID here, so long as it
+ * is more recent than the previous value. This allows us to detect
+ * split-brain scenarios in which a prior NN is still asserting its
+ * ACTIVE state but with a too-low transaction ID. See HDFS-2627
+ * for details.
+ */
+ private long lastActiveClaimTxId = -1;
+
+ BPOfferService(List<InetSocketAddress> nnAddrs, DataNode dn) {
+ Preconditions.checkArgument(!nnAddrs.isEmpty(),
+ "Must pass at least one NN.");
this.dn = dn;
- this.nnAddr = nnAddr;
- this.dnConf = dn.getDnConf();
+
+ for (InetSocketAddress addr : nnAddrs) {
+ this.bpServices.add(new BPServiceActor(addr, this));
+ }
+ }
+
+ void refreshNNList(ArrayList<InetSocketAddress> addrs) throws IOException {
+ Set<InetSocketAddress> oldAddrs = Sets.newHashSet();
+ for (BPServiceActor actor : bpServices) {
+ oldAddrs.add(actor.getNNSocketAddress());
+ }
+ Set<InetSocketAddress> newAddrs = Sets.newHashSet(addrs);
+
+ if (!Sets.symmetricDifference(oldAddrs, newAddrs).isEmpty()) {
+ // Keep things simple for now -- we can implement this at a later date.
+ throw new IOException(
+ "HA does not currently support adding a new standby to a running DN. " +
+ "Please do a rolling restart of DNs to reconfigure the list of NNs.");
+ }
}
/**
- * returns true if BP thread has completed initialization of storage
- * and has registered with the corresponding namenode
- * @return true if initialized
+ * @return true if the service has registered with at least one NameNode.
*/
- public boolean isInitialized() {
- return initialized;
+ boolean isInitialized() {
+ return bpRegistration != null;
}
- public boolean isAlive() {
- return shouldServiceRun && bpThread.isAlive();
+ /**
+ * @return true if there is at least one actor thread running which is
+ * talking to a NameNode.
+ */
+ boolean isAlive() {
+ for (BPServiceActor actor : bpServices) {
+ if (actor.isAlive()) {
+ return true;
+ }
+ }
+ return false;
}
- public String getBlockPoolId() {
+ String getBlockPoolId() {
if (bpNSInfo != null) {
return bpNSInfo.getBlockPoolID();
} else {
@@ -138,10 +173,11 @@
}
}
- public NamespaceInfo getNamespaceInfo() {
+ synchronized NamespaceInfo getNamespaceInfo() {
return bpNSInfo;
}
+ @Override
public String toString() {
if (bpNSInfo == null) {
// If we haven't yet connected to our NN, we don't yet know our
@@ -153,522 +189,363 @@
storageId = "unknown";
}
return "Block pool <registering> (storage id " + storageId +
- ") connecting to " + nnAddr;
+ ")";
} else {
return "Block pool " + getBlockPoolId() +
" (storage id " + dn.getStorageId() +
- ") registered with " + nnAddr;
+ ")";
}
}
- InetSocketAddress getNNSocketAddress() {
- return nnAddr;
- }
-
- /**
- * Used to inject a spy NN in the unit tests.
- */
- @VisibleForTesting
- void setNameNode(DatanodeProtocolClientSideTranslatorPB dnProtocol) {
- bpNamenode = dnProtocol;
- }
-
- /**
- * Perform the first part of the handshake with the NameNode.
- * This calls <code>versionRequest</code> to determine the NN's
- * namespace and version info. It automatically retries until
- * the NN responds or the DN is shutting down.
- *
- * @return the NamespaceInfo
- * @throws IncorrectVersionException if the remote NN does not match
- * this DN's version
- */
- NamespaceInfo retrieveNamespaceInfo() throws IncorrectVersionException {
- NamespaceInfo nsInfo = null;
- while (shouldRun()) {
- try {
- nsInfo = bpNamenode.versionRequest();
- LOG.debug(this + " received versionRequest response: " + nsInfo);
- break;
- } catch(SocketTimeoutException e) { // namenode is busy
- LOG.warn("Problem connecting to server: " + nnAddr);
- } catch(IOException e ) { // namenode is not available
- LOG.warn("Problem connecting to server: " + nnAddr);
- }
-
- // try again in a second
- sleepAndLogInterrupts(5000, "requesting version info from NN");
- }
-
- if (nsInfo != null) {
- checkNNVersion(nsInfo);
- }
- return nsInfo;
- }
-
- private void checkNNVersion(NamespaceInfo nsInfo)
- throws IncorrectVersionException {
- // build and layout versions should match
- String nsBuildVer = nsInfo.getBuildVersion();
- String stBuildVer = Storage.getBuildVersion();
- if (!nsBuildVer.equals(stBuildVer)) {
- LOG.warn("Data-node and name-node Build versions must be the same. " +
- "Namenode build version: " + nsBuildVer + "Datanode " +
- "build version: " + stBuildVer);
- throw new IncorrectVersionException(nsBuildVer, "namenode", stBuildVer);
- }
-
- if (HdfsConstants.LAYOUT_VERSION != nsInfo.getLayoutVersion()) {
- LOG.warn("Data-node and name-node layout versions must be the same." +
- " Expected: "+ HdfsConstants.LAYOUT_VERSION +
- " actual "+ bpNSInfo.getLayoutVersion());
- throw new IncorrectVersionException(
- bpNSInfo.getLayoutVersion(), "namenode");
- }
- }
-
- private void connectToNNAndHandshake() throws IOException {
- // get NN proxy
- bpNamenode = dn.connectToNN(nnAddr);
-
- // First phase of the handshake with NN - get the namespace
- // info.
- bpNSInfo = retrieveNamespaceInfo();
-
- // Now that we know the namespace ID, etc, we can pass this to the DN.
- // The DN can now initialize its local storage if we are the
- // first BP to handshake, etc.
- dn.initBlockPool(this);
-
- // Second phase of the handshake with the NN.
- register();
- }
-
- /**
- * This methods arranges for the data node to send the block report at
- * the next heartbeat.
- */
- void scheduleBlockReport(long delay) {
- if (delay > 0) { // send BR after random delay
- lastBlockReport = System.currentTimeMillis()
- - ( dnConf.blockReportInterval - DFSUtil.getRandom().nextInt((int)(delay)));
- } else { // send at next heartbeat
- lastBlockReport = lastHeartbeat - dnConf.blockReportInterval;
- }
- resetBlockReportTime = true; // reset future BRs for randomness
- }
-
void reportBadBlocks(ExtendedBlock block) {
- DatanodeInfo[] dnArr = { new DatanodeInfo(bpRegistration) };
- LocatedBlock[] blocks = { new LocatedBlock(block, dnArr) };
-
- try {
- bpNamenode.reportBadBlocks(blocks);
- } catch (IOException e){
- /* One common reason is that NameNode could be in safe mode.
- * Should we keep on retrying in that case?
- */
- LOG.warn("Failed to report bad block " + block + " to namenode : "
- + " Exception", e);
+ checkBlock(block);
+ for (BPServiceActor actor : bpServices) {
+ actor.reportBadBlocks(block);
}
-
}
- /**
- * Report received blocks and delete hints to the Namenode
- *
- * @throws IOException
- */
- private void reportReceivedDeletedBlocks() throws IOException {
-
- // check if there are newly received blocks
- ReceivedDeletedBlockInfo[] receivedAndDeletedBlockArray = null;
- int currentReceivedRequestsCounter;
- synchronized (receivedAndDeletedBlockList) {
- currentReceivedRequestsCounter = pendingReceivedRequests;
- int numBlocks = receivedAndDeletedBlockList.size();
- if (numBlocks > 0) {
- //
- // Send newly-received and deleted blockids to namenode
- //
- receivedAndDeletedBlockArray = receivedAndDeletedBlockList
- .toArray(new ReceivedDeletedBlockInfo[numBlocks]);
- }
- }
- if (receivedAndDeletedBlockArray != null) {
- StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks(
- bpRegistration.getStorageID(), receivedAndDeletedBlockArray) };
- bpNamenode.blockReceivedAndDeleted(bpRegistration, getBlockPoolId(),
- report);
- synchronized (receivedAndDeletedBlockList) {
- for (int i = 0; i < receivedAndDeletedBlockArray.length; i++) {
- receivedAndDeletedBlockList.remove(receivedAndDeletedBlockArray[i]);
- }
- pendingReceivedRequests -= currentReceivedRequestsCounter;
- }
- }
- }
-
/*
* Informing the name node could take a long long time! Should we wait
* till namenode is informed before responding with success to the
* client? For now we don't.
*/
void notifyNamenodeReceivedBlock(ExtendedBlock block, String delHint) {
- if (block == null || delHint == null) {
- throw new IllegalArgumentException(block == null ? "Block is null"
- : "delHint is null");
- }
+ checkBlock(block);
+ checkDelHint(delHint);
+ ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo(
+ block.getLocalBlock(),
+ ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK,
+ delHint);
- if (!block.getBlockPoolId().equals(getBlockPoolId())) {
- LOG.warn("BlockPool mismatch " + block.getBlockPoolId() + " vs. "
- + getBlockPoolId());
- return;
+ for (BPServiceActor actor : bpServices) {
+ actor.notifyNamenodeBlockImmediately(bInfo);
}
+ }
- synchronized (receivedAndDeletedBlockList) {
- receivedAndDeletedBlockList.add(new ReceivedDeletedBlockInfo(block
- .getLocalBlock(), delHint));
- pendingReceivedRequests++;
- receivedAndDeletedBlockList.notifyAll();
- }
+ private void checkBlock(ExtendedBlock block) {
+ Preconditions.checkArgument(block != null,
+ "block is null");
+ Preconditions.checkArgument(block.getBlockPoolId().equals(getBlockPoolId()),
+ "block belongs to BP %s instead of BP %s",
+ block.getBlockPoolId(), getBlockPoolId());
+ }
+
+ private void checkDelHint(String delHint) {
+ Preconditions.checkArgument(delHint != null,
+ "delHint is null");
}
void notifyNamenodeDeletedBlock(ExtendedBlock block) {
- if (block == null) {
- throw new IllegalArgumentException("Block is null");
+ checkBlock(block);
+ ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo(
+ block.getLocalBlock(), BlockStatus.DELETED_BLOCK, null);
+
+ for (BPServiceActor actor : bpServices) {
+ actor.notifyNamenodeDeletedBlock(bInfo);
}
-
- if (!block.getBlockPoolId().equals(getBlockPoolId())) {
- LOG.warn("BlockPool mismatch " + block.getBlockPoolId() + " vs. "
- + getBlockPoolId());
- return;
- }
-
- synchronized (receivedAndDeletedBlockList) {
- receivedAndDeletedBlockList.add(new ReceivedDeletedBlockInfo(block
- .getLocalBlock(), ReceivedDeletedBlockInfo.TODELETE_HINT));
+ }
+
+ void notifyNamenodeReceivingBlock(ExtendedBlock block) {
+ checkBlock(block);
+ ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo(
+ block.getLocalBlock(), BlockStatus.RECEIVING_BLOCK, null);
+
+ for (BPServiceActor actor : bpServices) {
+ actor.notifyNamenodeBlockImmediately(bInfo);
}
}
-
- /**
- * Report the list blocks to the Namenode
- * @throws IOException
- */
- DatanodeCommand blockReport() throws IOException {
- // send block report if timer has expired.
- DatanodeCommand cmd = null;
- long startTime = now();
- if (startTime - lastBlockReport > dnConf.blockReportInterval) {
-
- // Create block report
- long brCreateStartTime = now();
- BlockListAsLongs bReport = dn.data.getBlockReport(getBlockPoolId());
-
- // Send block report
- long brSendStartTime = now();
- StorageBlockReport[] report = { new StorageBlockReport(
- bpRegistration.getStorageID(), bReport.getBlockListAsLongs()) };
- cmd = bpNamenode.blockReport(bpRegistration, getBlockPoolId(), report);
-
- // Log the block report processing stats from Datanode perspective
- long brSendCost = now() - brSendStartTime;
- long brCreateCost = brSendStartTime - brCreateStartTime;
- dn.metrics.addBlockReport(brSendCost);
- LOG.info("BlockReport of " + bReport.getNumberOfBlocks()
- + " blocks took " + brCreateCost + " msec to generate and "
- + brSendCost + " msecs for RPC and NN processing");
-
- // If we have sent the first block report, then wait a random
- // time before we start the periodic block reports.
- if (resetBlockReportTime) {
- lastBlockReport = startTime - DFSUtil.getRandom().nextInt((int)(dnConf.blockReportInterval));
- resetBlockReportTime = false;
- } else {
- /* say the last block report was at 8:20:14. The current report
- * should have started around 9:20:14 (default 1 hour interval).
- * If current time is :
- * 1) normal like 9:20:18, next report should be at 10:20:14
- * 2) unexpected like 11:35:43, next report should be at 12:20:14
- */
- lastBlockReport += (now() - lastBlockReport) /
- dnConf.blockReportInterval * dnConf.blockReportInterval;
- }
- LOG.info("sent block report, processed command:" + cmd);
- }
- return cmd;
- }
-
-
- DatanodeCommand [] sendHeartBeat() throws IOException {
- // reports number of failed volumes
- StorageReport[] report = { new StorageReport(bpRegistration.getStorageID(),
- false, dn.data.getCapacity(), dn.data.getDfsUsed(),
- dn.data.getRemaining(), dn.data.getBlockPoolUsed(getBlockPoolId())) };
- return bpNamenode.sendHeartbeat(bpRegistration, report,
- dn.xmitsInProgress.get(),
- dn.getXceiverCount(), dn.data.getNumFailedVolumes());
- }
-
//This must be called only by blockPoolManager
void start() {
- if ((bpThread != null) && (bpThread.isAlive())) {
- //Thread is started already
- return;
+ for (BPServiceActor actor : bpServices) {
+ actor.start();
}
- bpThread = new Thread(this, formatThreadName());
- bpThread.setDaemon(true); // needed for JUnit testing
- bpThread.start();
- }
-
- private String formatThreadName() {
- Collection<URI> dataDirs = DataNode.getStorageDirs(dn.getConf());
- return "DataNode: [" +
- StringUtils.uriToString(dataDirs.toArray(new URI[0])) + "] " +
- " heartbeating to " + nnAddr;
}
//This must be called only by blockPoolManager.
void stop() {
- shouldServiceRun = false;
- if (bpThread != null) {
- bpThread.interrupt();
+ for (BPServiceActor actor : bpServices) {
+ actor.stop();
}
}
//This must be called only by blockPoolManager
void join() {
- try {
- if (bpThread != null) {
- bpThread.join();
- }
- } catch (InterruptedException ie) { }
+ for (BPServiceActor actor : bpServices) {
+ actor.join();
+ }
+ }
+
+ synchronized UpgradeManagerDatanode getUpgradeManager() {
+ if(upgradeManager == null)
+ upgradeManager =
+ new UpgradeManagerDatanode(dn, getBlockPoolId());
+
+ return upgradeManager;
}
- //Cleanup method to be called by current thread before exiting.
- private synchronized void cleanUp() {
-
- if(upgradeManager != null)
- upgradeManager.shutdownUpgrade();
- shouldServiceRun = false;
- IOUtils.cleanup(LOG, bpNamenode);
- dn.shutdownBlockPool(this);
+ void processDistributedUpgradeCommand(UpgradeCommand comm)
+ throws IOException {
+ UpgradeManagerDatanode upgradeManager = getUpgradeManager();
+ upgradeManager.processUpgradeCommand(comm);
}
/**
- * Main loop for each BP thread. Run until shutdown,
- * forever calling remote NameNode functions.
+ * Start distributed upgrade if it should be initiated by the data-node.
*/
- private void offerService() throws Exception {
- LOG.info("For namenode " + nnAddr + " using DELETEREPORT_INTERVAL of "
- + dnConf.deleteReportInterval + " msec " + " BLOCKREPORT_INTERVAL of "
- + dnConf.blockReportInterval + "msec" + " Initial delay: "
- + dnConf.initialBlockReportDelay + "msec" + "; heartBeatInterval="
- + dnConf.heartBeatInterval);
-
- //
- // Now loop for a long time....
- //
- while (shouldRun()) {
- try {
- long startTime = now();
-
- //
- // Every so often, send heartbeat or block-report
- //
- if (startTime - lastHeartbeat > dnConf.heartBeatInterval) {
- //
- // All heartbeat messages include following info:
- // -- Datanode name
- // -- data transfer port
- // -- Total capacity
- // -- Bytes remaining
- //
- lastHeartbeat = startTime;
- if (!dn.areHeartbeatsDisabledForTests()) {
- DatanodeCommand[] cmds = sendHeartBeat();
- dn.metrics.addHeartbeat(now() - startTime);
-
- long startProcessCommands = now();
- if (!processCommand(cmds))
- continue;
- long endProcessCommands = now();
- if (endProcessCommands - startProcessCommands > 2000) {
- LOG.info("Took " + (endProcessCommands - startProcessCommands) +
- "ms to process " + cmds.length + " commands from NN");
- }
- }
- }
- if (pendingReceivedRequests > 0
- || (startTime - lastDeletedReport > dnConf.deleteReportInterval)) {
- reportReceivedDeletedBlocks();
- lastDeletedReport = startTime;
- }
-
- DatanodeCommand cmd = blockReport();
- processCommand(cmd);
-
- // Now safe to start scanning the block pool
- if (dn.blockScanner != null) {
- dn.blockScanner.addBlockPool(this.getBlockPoolId());
- }
-
- //
- // There is no work to do; sleep until hearbeat timer elapses,
- // or work arrives, and then iterate again.
- //
- long waitTime = dnConf.heartBeatInterval -
- (System.currentTimeMillis() - lastHeartbeat);
- synchronized(receivedAndDeletedBlockList) {
- if (waitTime > 0 && pendingReceivedRequests == 0) {
- try {
- receivedAndDeletedBlockList.wait(waitTime);
- } catch (InterruptedException ie) {
- LOG.warn("BPOfferService for " + this + " interrupted");
- }
- }
- } // synchronized
- } catch(RemoteException re) {
- String reClass = re.getClassName();
- if (UnregisteredNodeException.class.getName().equals(reClass) ||
- DisallowedDatanodeException.class.getName().equals(reClass) ||
- IncorrectVersionException.class.getName().equals(reClass)) {
- LOG.warn(this + " is shutting down", re);
- shouldServiceRun = false;
- return;
- }
- LOG.warn("RemoteException in offerService", re);
- try {
- long sleepTime = Math.min(1000, dnConf.heartBeatInterval);
- Thread.sleep(sleepTime);
- } catch (InterruptedException ie) {
- Thread.currentThread().interrupt();
- }
- } catch (IOException e) {
- LOG.warn("IOException in offerService", e);
- }
- } // while (shouldRun())
- } // offerService
-
- /**
- * Register one bp with the corresponding NameNode
- * <p>
- * The bpDatanode needs to register with the namenode on startup in order
- * 1) to report which storage it is serving now and
- * 2) to receive a registrationID
- *
- * issued by the namenode to recognize registered datanodes.
- *
- * @see FSNamesystem#registerDatanode(DatanodeRegistration)
- * @throws IOException
- */
- void register() throws IOException {
- Preconditions.checkState(bpNSInfo != null,
- "register() should be called after handshake()");
+ synchronized void startDistributedUpgradeIfNeeded() throws IOException {
+ UpgradeManagerDatanode um = getUpgradeManager();
- // The handshake() phase loaded the block pool storage
- // off disk - so update the bpRegistration object from that info
- bpRegistration = dn.createBPRegistration(bpNSInfo);
-
- LOG.info(this + " beginning handshake with NN");
-
- while (shouldRun()) {
- try {
- // Use returned registration from namenode with updated machine name.
- bpRegistration = bpNamenode.registerDatanode(bpRegistration,
- new DatanodeStorage[0]);
- break;
- } catch(SocketTimeoutException e) { // namenode is busy
- LOG.info("Problem connecting to server: " + nnAddr);
- sleepAndLogInterrupts(1000, "connecting to server");
- }
- }
-
- LOG.info("Block pool " + this + " successfully registered with NN");
- dn.bpRegistrationSucceeded(bpRegistration, getBlockPoolId());
-
- // random short delay - helps scatter the BR from all DNs
- scheduleBlockReport(dnConf.initialBlockReportDelay);
+ if(!um.getUpgradeState())
+ return;
+ um.setUpgradeState(false, um.getUpgradeVersion());
+ um.startUpgrade();
+ return;
}
-
-
- private void sleepAndLogInterrupts(int millis,
- String stateString) {
- try {
- Thread.sleep(millis);
- } catch (InterruptedException ie) {
- LOG.info("BPOfferService " + this +
- " interrupted while " + stateString);
- }
+
+ DataNode getDataNode() {
+ return dn;
}
/**
- * No matter what kind of exception we get, keep retrying to offerService().
- * That's the loop that connects to the NameNode and provides basic DataNode
- * functionality.
- *
- * Only stop when "shouldRun" or "shouldServiceRun" is turned off, which can
- * happen either at shutdown or due to refreshNamenodes.
+ * Called by the BPServiceActors when they handshake to a NN.
+ * If this is the first NN connection, this sets the namespace info
+ * for this BPOfferService. If it's a connection to a new NN, it
+ * verifies that this namespace matches (e.g. to prevent a misconfiguration
+ * where a StandbyNode from a different cluster is specified).
*/
- @Override
- public void run() {
- LOG.info(this + " starting to offer service");
-
- try {
- // init stuff
- try {
- // setup storage
- connectToNNAndHandshake();
- } catch (IOException ioe) {
- // Initial handshake, storage recovery or registration failed
- // End BPOfferService thread
- LOG.fatal("Initialization failed for block pool " + this, ioe);
- return;
- }
-
- initialized = true; // bp is initialized;
+ synchronized void verifyAndSetNamespaceInfo(NamespaceInfo nsInfo) throws IOException {
+ if (this.bpNSInfo == null) {
+ this.bpNSInfo = nsInfo;
- while (shouldRun()) {
- try {
- startDistributedUpgradeIfNeeded();
- offerService();
- } catch (Exception ex) {
- LOG.error("Exception in BPOfferService for " + this, ex);
- sleepAndLogInterrupts(5000, "offering service");
- }
- }
- } catch (Throwable ex) {
- LOG.warn("Unexpected exception in block pool " + this, ex);
- } finally {
- LOG.warn("Ending block pool service for: " + this);
- cleanUp();
+ // Now that we know the namespace ID, etc, we can pass this to the DN.
+ // The DN can now initialize its local storage if we are the
+ // first BP to handshake, etc.
+ dn.initBlockPool(this);
+ return;
+ } else {
+ checkNSEquality(bpNSInfo.getBlockPoolID(), nsInfo.getBlockPoolID(),
+ "Blockpool ID");
+ checkNSEquality(bpNSInfo.getNamespaceID(), nsInfo.getNamespaceID(),
+ "Namespace ID");
+ checkNSEquality(bpNSInfo.getClusterID(), nsInfo.getClusterID(),
+ "Cluster ID");
}
}
- private boolean shouldRun() {
- return shouldServiceRun && dn.shouldRun();
- }
-
/**
- * Process an array of datanode commands
- *
- * @param cmds an array of datanode commands
- * @return true if further processing may be required or false otherwise.
+ * After one of the BPServiceActors registers successfully with the
+ * NN, it calls this function to verify that the NN it connected to
+ * is consistent with other NNs serving the block-pool.
*/
- private boolean processCommand(DatanodeCommand[] cmds) {
- if (cmds != null) {
- for (DatanodeCommand cmd : cmds) {
- try {
- if (processCommand(cmd) == false) {
- return false;
- }
- } catch (IOException ioe) {
- LOG.warn("Error processing datanode Command", ioe);
- }
+ void registrationSucceeded(BPServiceActor bpServiceActor,
+ DatanodeRegistration reg) throws IOException {
+ if (bpRegistration != null) {
+ checkNSEquality(bpRegistration.storageInfo.getNamespaceID(),
+ reg.storageInfo.getNamespaceID(), "namespace ID");
+ checkNSEquality(bpRegistration.storageInfo.getClusterID(),
+ reg.storageInfo.getClusterID(), "cluster ID");
+ } else {
+ bpRegistration = reg;
+ }
+
+ dn.bpRegistrationSucceeded(bpRegistration, getBlockPoolId());
+ }
+
+ /**
+ * Verify equality of two namespace-related fields, throwing
+ * an exception if they are unequal.
+ */
+ private static void checkNSEquality(
+ Object ourID, Object theirID,
+ String idHelpText) throws IOException {
+ if (!ourID.equals(theirID)) {
+ throw new IOException(idHelpText + " mismatch: " +
+ "previously connected to " + idHelpText + " " + ourID +
+ " but now connected to " + idHelpText + " " + theirID);
+ }
+ }
+
+ synchronized DatanodeRegistration createRegistration() {
+ Preconditions.checkState(bpNSInfo != null,
+ "getRegistration() can only be called after initial handshake");
+ return dn.createBPRegistration(bpNSInfo);
+ }
+
+ /**
+ * Called when an actor shuts down. If this is the last actor
+ * to shut down, shuts down the whole blockpool in the DN.
+ */
+ synchronized void shutdownActor(BPServiceActor actor) {
+ if (bpServiceToActive == actor) {
+ bpServiceToActive = null;
+ }
+
+ bpServices.remove(actor);
+
+ if (bpServices.isEmpty()) {
+ dn.shutdownBlockPool(this);
+
+ if(upgradeManager != null)
+ upgradeManager.shutdownUpgrade();
+ }
+ }
+
+ /**
+ * Called by the DN to report an error to the NNs.
+ */
+ void trySendErrorReport(int errCode, String errMsg) {
+ for (BPServiceActor actor : bpServices) {
+ actor.trySendErrorReport(errCode, errMsg);
+ }
+ }
+
+ /**
+ * Ask each of the actors to schedule a block report after
+ * the specified delay.
+ */
+ void scheduleBlockReport(long delay) {
+ for (BPServiceActor actor : bpServices) {
+ actor.scheduleBlockReport(delay);
+ }
+ }
+
+ /**
+ * Ask each of the actors to report a bad block hosted on another DN.
+ */
+ void reportRemoteBadBlock(DatanodeInfo dnInfo, ExtendedBlock block) {
+ for (BPServiceActor actor : bpServices) {
+ try {
+ actor.reportRemoteBadBlock(dnInfo, block);
+ } catch (IOException e) {
+ LOG.warn("Couldn't report bad block " + block + " to " + actor,
+ e);
}
}
- return true;
+ }
+
+ /**
+ * @return a proxy to the active NN, or null if the BPOS has not
+ * acknowledged any NN as active yet.
+ */
+ synchronized DatanodeProtocolClientSideTranslatorPB getActiveNN() {
+ if (bpServiceToActive != null) {
+ return bpServiceToActive.bpNamenode;
+ } else {
+ return null;
+ }
+ }
+
+ @VisibleForTesting
+ synchronized List<BPServiceActor> getBPServiceActors() {
+ return Lists.newArrayList(bpServices);
+ }
+
+ /**
+ * Update the BPOS's view of which NN is active, based on a heartbeat
+ * response from one of the actors.
+ *
+ * @param actor the actor which received the heartbeat
+ * @param nnHaState the HA-related heartbeat contents
+ */
+ synchronized void updateActorStatesFromHeartbeat(
+ BPServiceActor actor,
+ NNHAStatusHeartbeat nnHaState) {
+ final long txid = nnHaState.getTxId();
+
+ final boolean nnClaimsActive =
+ nnHaState.getState() == NNHAStatusHeartbeat.State.ACTIVE;
+ final boolean bposThinksActive = bpServiceToActive == actor;
+ final boolean isMoreRecentClaim = txid > lastActiveClaimTxId;
+
+ if (nnClaimsActive && !bposThinksActive) {
+ LOG.info("Namenode " + actor + " trying to claim ACTIVE state with " +
+ "txid=" + txid);
+ if (!isMoreRecentClaim) {
+ // Split-brain scenario - an NN is trying to claim active
+ // state when a different NN has already claimed it with a higher
+ // txid.
+ LOG.warn("NN " + actor + " tried to claim ACTIVE state at txid=" +
+ txid + " but there was already a more recent claim at txid=" +
+ lastActiveClaimTxId);
+ return;
+ } else {
+ if (bpServiceToActive == null) {
+ LOG.info("Acknowledging ACTIVE Namenode " + actor);
+ } else {
+ LOG.info("Namenode " + actor + " taking over ACTIVE state from " +
+ bpServiceToActive + " at higher txid=" + txid);
+ }
+ bpServiceToActive = actor;
+ }
+ } else if (!nnClaimsActive && bposThinksActive) {
+ LOG.info("Namenode " + actor + " relinquishing ACTIVE state with " +
+ "txid=" + nnHaState.getTxId());
+ bpServiceToActive = null;
+ }
+
+ if (bpServiceToActive == actor) {
+ assert txid >= lastActiveClaimTxId;
+ lastActiveClaimTxId = txid;
+ }
+ }
+
+ /**
+ * @return true if the given NN address is one of the NNs for this
+ * block pool
+ */
+ boolean containsNN(InetSocketAddress addr) {
+ for (BPServiceActor actor : bpServices) {
+ if (actor.getNNSocketAddress().equals(addr)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @VisibleForTesting
+ int countNameNodes() {
+ return bpServices.size();
+ }
+
+ /**
+ * Run an immediate block report on this thread. Used by tests.
+ */
+ @VisibleForTesting
+ void triggerBlockReportForTests() throws IOException {
+ for (BPServiceActor actor : bpServices) {
+ actor.triggerBlockReportForTests();
+ }
+ }
+
+ /**
+ * Run an immediate deletion report on this thread. Used by tests.
+ */
+ @VisibleForTesting
+ void triggerDeletionReportForTests() throws IOException {
+ for (BPServiceActor actor : bpServices) {
+ actor.triggerDeletionReportForTests();
+ }
+ }
+
+ /**
+ * Run an immediate heartbeat from all actors. Used by tests.
+ */
+ @VisibleForTesting
+ void triggerHeartbeatForTests() throws IOException {
+ for (BPServiceActor actor : bpServices) {
+ actor.triggerHeartbeatForTests();
+ }
+ }
+
+ synchronized boolean processCommandFromActor(DatanodeCommand cmd,
+ BPServiceActor actor) throws IOException {
+ assert bpServices.contains(actor);
+ if (actor == bpServiceToActive) {
+ return processCommandFromActive(cmd, actor);
+ } else {
+ return processCommandFromStandby(cmd, actor);
+ }
}
/**
@@ -677,7 +554,8 @@
* @return true if further processing may be required or false otherwise.
* @throws IOException
*/
- private boolean processCommand(DatanodeCommand cmd) throws IOException {
+ private boolean processCommandFromActive(DatanodeCommand cmd,
+ BPServiceActor actor) throws IOException {
if (cmd == null)
return true;
final BlockCommand bcmd =
@@ -708,19 +586,13 @@
dn.metrics.incrBlocksRemoved(toDelete.length);
break;
case DatanodeProtocol.DNA_SHUTDOWN:
- // shut down the data node
- shouldServiceRun = false;
- return false;
+ // TODO: DNA_SHUTDOWN appears to be unused - the NN never sends this command
+ // See HDFS-2987.
+ throw new UnsupportedOperationException("Received unimplemented DNA_SHUTDOWN");
case DatanodeProtocol.DNA_REGISTER:
// namenode requested a registration - at start or if NN lost contact
LOG.info("DatanodeCommand action: DNA_REGISTER");
- if (shouldRun()) {
- // re-retrieve namespace info to make sure that, if the NN
- // was restarted, we still match its version (HDFS-2120)
- retrieveNamespaceInfo();
- // and re-register
- register();
- }
+ actor.reRegister();
break;
case DatanodeProtocol.DNA_FINALIZE:
String bp = ((FinalizeCommand) cmd).getBlockPoolId();
@@ -740,7 +612,8 @@
case DatanodeProtocol.DNA_ACCESSKEYUPDATE:
LOG.info("DatanodeCommand action: DNA_ACCESSKEYUPDATE");
if (dn.isBlockTokenEnabled) {
- dn.blockPoolTokenSecretManager.setKeys(getBlockPoolId(),
+ dn.blockPoolTokenSecretManager.setKeys(
+ getBlockPoolId(),
((KeyUpdateCommand) cmd).getExportedKeys());
}
break;
@@ -759,41 +632,29 @@
}
return true;
}
-
- private void processDistributedUpgradeCommand(UpgradeCommand comm)
- throws IOException {
- UpgradeManagerDatanode upgradeManager = getUpgradeManager();
- upgradeManager.processUpgradeCommand(comm);
+
+ private boolean processCommandFromStandby(DatanodeCommand cmd,
+ BPServiceActor actor) throws IOException {
+ if (cmd == null)
+ return true;
+ switch(cmd.getAction()) {
+ case DatanodeProtocol.DNA_REGISTER:
+ // namenode requested a registration - at start or if NN lost contact
+ LOG.info("DatanodeCommand action: DNA_REGISTER");
+ actor.reRegister();
+ return true;
+ case DatanodeProtocol.DNA_TRANSFER:
+ case DatanodeProtocol.DNA_INVALIDATE:
+ case DatanodeProtocol.DNA_SHUTDOWN:
+ case DatanodeProtocol.DNA_RECOVERBLOCK:
+ case DatanodeProtocol.DNA_ACCESSKEYUPDATE:
+ case DatanodeProtocol.DNA_BALANCERBANDWIDTHUPDATE:
+ LOG.warn("Got a command from standby NN - ignoring command:" + cmd.getAction());
+ return true;
+ default:
+ LOG.warn("Unknown DatanodeCommand action: " + cmd.getAction());
+ }
+ return true;
}
- synchronized UpgradeManagerDatanode getUpgradeManager() {
- if(upgradeManager == null)
- upgradeManager =
- new UpgradeManagerDatanode(dn, getBlockPoolId());
-
- return upgradeManager;
- }
-
- /**
- * Start distributed upgrade if it should be initiated by the data-node.
- */
- private void startDistributedUpgradeIfNeeded() throws IOException {
- UpgradeManagerDatanode um = getUpgradeManager();
-
- if(!um.getUpgradeState())
- return;
- um.setUpgradeState(false, um.getUpgradeVersion());
- um.startUpgrade();
- return;
- }
-
- @VisibleForTesting
- DatanodeProtocolClientSideTranslatorPB getBpNamenode() {
- return bpNamenode;
- }
-
- @VisibleForTesting
- void setBpNamenode(DatanodeProtocolClientSideTranslatorPB bpNamenode) {
- this.bpNamenode = bpNamenode;
- }
}
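
The core of updateActorStatesFromHeartbeat() above is a monotonic-txid rule: an ACTIVE claim is honored only if it is newer than the last acknowledged claim, which is what defeats a stale NN still asserting ACTIVE after a failover (HDFS-2627). A hedged restatement of just that decision, separated from the surrounding bookkeeping:

    // Hedged restatement of the arbitration rule -- not part of the patch.
    boolean acceptActiveClaim(boolean alreadyAcknowledgedActive,
                              long claimedTxId, long lastActiveClaimTxId) {
      if (alreadyAcknowledgedActive) {
        return true;                            // already tracking this NN as active
      }
      return claimedTxId > lastActiveClaimTxId; // newer claim wins; stale claims are ignored
    }
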
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
new file mode 100644
index 0000000..75f32cb
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
@@ -0,0 +1,730 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.datanode;
+
+import static org.apache.hadoop.hdfs.server.common.Util.now;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.SocketTimeoutException;
+import java.net.URI;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.UnregisteredNodeException;
+import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.server.common.IncorrectVersionException;
+import org.apache.hadoop.hdfs.server.common.Storage;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
+import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
+import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
+import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
+import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
+import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
+import org.apache.hadoop.hdfs.server.protocol.StorageReport;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.ipc.RemoteException;
+import org.apache.hadoop.util.StringUtils;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Maps;
+
+/**
+ * A thread per active or standby namenode to perform:
+ * <ul>
+ * <li> Pre-registration handshake with namenode</li>
+ * <li> Registration with namenode</li>
+ * <li> Send periodic heartbeats to the namenode</li>
+ * <li> Handle commands received from the namenode</li>
+ * </ul>
+ */
+@InterfaceAudience.Private
+class BPServiceActor implements Runnable {
+
+ static final Log LOG = DataNode.LOG;
+ final InetSocketAddress nnAddr;
+
+ BPOfferService bpos;
+
+ long lastBlockReport = 0;
+ long lastDeletedReport = 0;
+
+ boolean resetBlockReportTime = true;
+
+ Thread bpThread;
+ DatanodeProtocolClientSideTranslatorPB bpNamenode;
+ private long lastHeartbeat = 0;
+ private volatile boolean initialized = false;
+
+ /**
+ * Between block reports (which happen on the order of once an hour) the
+ * DN reports smaller incremental changes to its block list. This map,
+ * keyed by block ID, contains the pending changes which have yet to be
+ * reported to the NN. Access should be synchronized on this object.
+ */
+ private final Map<Long, ReceivedDeletedBlockInfo> pendingIncrementalBR
+ = Maps.newHashMap();
+
+ private volatile int pendingReceivedRequests = 0;
+ private volatile boolean shouldServiceRun = true;
+ private final DataNode dn;
+ private final DNConf dnConf;
+
+ private DatanodeRegistration bpRegistration;
+
+ BPServiceActor(InetSocketAddress nnAddr, BPOfferService bpos) {
+ this.bpos = bpos;
+ this.dn = bpos.getDataNode();
+ this.nnAddr = nnAddr;
+ this.dnConf = dn.getDnConf();
+ }
+
+ /**
+ * returns true if BP thread has completed initialization of storage
+ * and has registered with the corresponding namenode
+ * @return true if initialized
+ */
+ boolean isInitialized() {
+ return initialized;
+ }
+
+ boolean isAlive() {
+ return shouldServiceRun && bpThread.isAlive();
+ }
+
+ @Override
+ public String toString() {
+ return bpos.toString() + " service to " + nnAddr;
+ }
+
+ InetSocketAddress getNNSocketAddress() {
+ return nnAddr;
+ }
+
+ /**
+ * Used to inject a spy NN in the unit tests.
+ */
+ @VisibleForTesting
+ void setNameNode(DatanodeProtocolClientSideTranslatorPB dnProtocol) {
+ bpNamenode = dnProtocol;
+ }
+
+ @VisibleForTesting
+ DatanodeProtocolClientSideTranslatorPB getNameNodeProxy() {
+ return bpNamenode;
+ }
+
+ /**
+ * Perform the first part of the handshake with the NameNode.
+ * This calls <code>versionRequest</code> to determine the NN's
+ * namespace and version info. It automatically retries until
+ * the NN responds or the DN is shutting down.
+ *
+ * @return the NamespaceInfo
+ */
+ @VisibleForTesting
+ NamespaceInfo retrieveNamespaceInfo() throws IOException {
+ NamespaceInfo nsInfo = null;
+ while (shouldRun()) {
+ try {
+ nsInfo = bpNamenode.versionRequest();
+ LOG.debug(this + " received versionRequest response: " + nsInfo);
+ break;
+ } catch(SocketTimeoutException e) { // namenode is busy
+ LOG.warn("Problem connecting to server: " + nnAddr);
+ } catch(IOException e ) { // namenode is not available
+ LOG.warn("Problem connecting to server: " + nnAddr);
+ }
+
+ // try again in a second
+ sleepAndLogInterrupts(5000, "requesting version info from NN");
+ }
+
+ if (nsInfo != null) {
+ checkNNVersion(nsInfo);
+ } else {
+ throw new IOException("DN shut down before block pool connected");
+ }
+ return nsInfo;
+ }
+
+ private void checkNNVersion(NamespaceInfo nsInfo)
+ throws IncorrectVersionException {
+ // build and layout versions should match
+ String nsBuildVer = nsInfo.getBuildVersion();
+ String stBuildVer = Storage.getBuildVersion();
+ if (!nsBuildVer.equals(stBuildVer)) {
+ LOG.warn("Data-node and name-node Build versions must be the same. " +
+ "Namenode build version: " + nsBuildVer + "Datanode " +
+ "build version: " + stBuildVer);
+ throw new IncorrectVersionException(nsBuildVer, "namenode", stBuildVer);
+ }
+
+ if (HdfsConstants.LAYOUT_VERSION != nsInfo.getLayoutVersion()) {
+ LOG.warn("Data-node and name-node layout versions must be the same." +
+ " Expected: "+ HdfsConstants.LAYOUT_VERSION +
+ " actual "+ nsInfo.getLayoutVersion());
+ throw new IncorrectVersionException(
+ nsInfo.getLayoutVersion(), "namenode");
+ }
+ }
+
+ private void connectToNNAndHandshake() throws IOException {
+ // get NN proxy
+ bpNamenode = dn.connectToNN(nnAddr);
+
+ // First phase of the handshake with NN - get the namespace
+ // info.
+ NamespaceInfo nsInfo = retrieveNamespaceInfo();
+
+ // Verify that this matches the other NN in this HA pair.
+ // This also initializes our block pool in the DN if we are
+ // the first NN connection for this BP.
+ bpos.verifyAndSetNamespaceInfo(nsInfo);
+
+ // Second phase of the handshake with the NN.
+ register();
+ }
+
+ /**
+ * This method arranges for the data node to send the block report at
+ * the next heartbeat.
+ */
+ void scheduleBlockReport(long delay) {
+ if (delay > 0) { // send BR after random delay
+ lastBlockReport = System.currentTimeMillis()
+ - ( dnConf.blockReportInterval - DFSUtil.getRandom().nextInt((int)(delay)));
+ } else { // send at next heartbeat
+ lastBlockReport = lastHeartbeat - dnConf.blockReportInterval;
+ }
+ resetBlockReportTime = true; // reset future BRs for randomness
+ }
+
+ void reportBadBlocks(ExtendedBlock block) {
+ DatanodeInfo[] dnArr = { new DatanodeInfo(bpRegistration) };
+ LocatedBlock[] blocks = { new LocatedBlock(block, dnArr) };
+
+ try {
+ bpNamenode.reportBadBlocks(blocks);
+ } catch (IOException e){
+ /* One common reason is that NameNode could be in safe mode.
+ * Should we keep on retrying in that case?
+ */
+ LOG.warn("Failed to report bad block " + block + " to namenode : "
+ + " Exception", e);
+ }
+ }
+
+ /**
+ * Report received blocks and delete hints to the Namenode
+ *
+ * @throws IOException
+ */
+ private void reportReceivedDeletedBlocks() throws IOException {
+
+ // check if there are newly received blocks
+ ReceivedDeletedBlockInfo[] receivedAndDeletedBlockArray = null;
+ synchronized (pendingIncrementalBR) {
+ int numBlocks = pendingIncrementalBR.size();
+ if (numBlocks > 0) {
+ //
+ // Send newly-received and deleted blockids to namenode
+ //
+ receivedAndDeletedBlockArray = pendingIncrementalBR
+ .values().toArray(new ReceivedDeletedBlockInfo[numBlocks]);
+ }
+ pendingIncrementalBR.clear();
+ }
+ if (receivedAndDeletedBlockArray != null) {
+ StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks(
+ bpRegistration.getStorageID(), receivedAndDeletedBlockArray) };
+ boolean success = false;
+ try {
+ bpNamenode.blockReceivedAndDeleted(bpRegistration, bpos.getBlockPoolId(),
+ report);
+ success = true;
+ } finally {
+ synchronized (pendingIncrementalBR) {
+ if (!success) {
+ // If we didn't succeed in sending the report, put all of the
+ // blocks back onto our queue, but only in the case where we didn't
+ // put something newer in the meantime.
+ for (ReceivedDeletedBlockInfo rdbi : receivedAndDeletedBlockArray) {
+ if (!pendingIncrementalBR.containsKey(rdbi.getBlock().getBlockId())) {
+ pendingIncrementalBR.put(rdbi.getBlock().getBlockId(), rdbi);
+ }
+ }
+ }
+ pendingReceivedRequests = pendingIncrementalBR.size();
+ }
+ }
+ }
+ }
+
+ /*
+ * Informing the name node could take a long long time! Should we wait
+ * till namenode is informed before responding with success to the
+ * client? For now we don't.
+ */
+ void notifyNamenodeBlockImmediately(ReceivedDeletedBlockInfo bInfo) {
+ synchronized (pendingIncrementalBR) {
+ pendingIncrementalBR.put(
+ bInfo.getBlock().getBlockId(), bInfo);
+ pendingReceivedRequests++;
+ pendingIncrementalBR.notifyAll();
+ }
+ }
+
+ void notifyNamenodeDeletedBlock(ReceivedDeletedBlockInfo bInfo) {
+ synchronized (pendingIncrementalBR) {
+ pendingIncrementalBR.put(
+ bInfo.getBlock().getBlockId(), bInfo);
+ }
+ }
+
+ /**
+ * Run an immediate block report on this thread. Used by tests.
+ */
+ @VisibleForTesting
+ void triggerBlockReportForTests() throws IOException {
+ synchronized (pendingIncrementalBR) {
+ lastBlockReport = 0;
+ lastHeartbeat = 0;
+ pendingIncrementalBR.notifyAll();
+ while (lastBlockReport == 0) {
+ try {
+ pendingIncrementalBR.wait(100);
+ } catch (InterruptedException e) {
+ return;
+ }
+ }
+ }
+ }
+
+ @VisibleForTesting
+ void triggerHeartbeatForTests() throws IOException {
+ synchronized (pendingIncrementalBR) {
+ lastHeartbeat = 0;
+ pendingIncrementalBR.notifyAll();
+ while (lastHeartbeat == 0) {
+ try {
+ pendingIncrementalBR.wait(100);
+ } catch (InterruptedException e) {
+ return;
+ }
+ }
+ }
+ }
+
+ @VisibleForTesting
+ void triggerDeletionReportForTests() throws IOException {
+ synchronized (pendingIncrementalBR) {
+ lastDeletedReport = 0;
+ pendingIncrementalBR.notifyAll();
+
+ while (lastDeletedReport == 0) {
+ try {
+ pendingIncrementalBR.wait(100);
+ } catch (InterruptedException e) {
+ return;
+ }
+ }
+ }
+ }
+
+ /**
+ * Report the full list of blocks to the Namenode.
+ * @throws IOException
+ */
+ DatanodeCommand blockReport() throws IOException {
+ // send block report if timer has expired.
+ DatanodeCommand cmd = null;
+ long startTime = now();
+ if (startTime - lastBlockReport > dnConf.blockReportInterval) {
+
+ // Flush any block information that precedes the block report. Otherwise
+ // we may miss the delHint information, or report an RBW replica after
+ // the block report has already reported a FINALIZED one.
+ reportReceivedDeletedBlocks();
+
+ // Create block report
+ long brCreateStartTime = now();
+ BlockListAsLongs bReport = dn.getFSDataset().getBlockReport(
+ bpos.getBlockPoolId());
+
+ // Send block report
+ long brSendStartTime = now();
+ StorageBlockReport[] report = { new StorageBlockReport(
+ bpRegistration.getStorageID(), bReport.getBlockListAsLongs()) };
+ cmd = bpNamenode.blockReport(bpRegistration, bpos.getBlockPoolId(), report);
+
+ // Log the block report processing stats from Datanode perspective
+ long brSendCost = now() - brSendStartTime;
+ long brCreateCost = brSendStartTime - brCreateStartTime;
+ dn.getMetrics().addBlockReport(brSendCost);
+ LOG.info("BlockReport of " + bReport.getNumberOfBlocks()
+ + " blocks took " + brCreateCost + " msec to generate and "
+ + brSendCost + " msecs for RPC and NN processing");
+
+ // If we have sent the first block report, then wait a random
+ // time before we start the periodic block reports.
+ if (resetBlockReportTime) {
+ lastBlockReport = startTime - DFSUtil.getRandom().nextInt((int)(dnConf.blockReportInterval));
+ resetBlockReportTime = false;
+ } else {
+ /* say the last block report was at 8:20:14. The current report
+ * should have started around 9:20:14 (default 1 hour interval).
+ * If current time is :
+ * 1) normal like 9:20:18, next report should be at 10:20:14
+ * 2) unexpected like 11:35:43, next report should be at 12:20:14
+ */
+ lastBlockReport += (now() - lastBlockReport) /
+ dnConf.blockReportInterval * dnConf.blockReportInterval;
+ }
+ LOG.info("sent block report, processed command:" + cmd);
+ }
+ return cmd;
+ }
+
+
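+ /**
+ * Send a single heartbeat to the NameNode, reporting capacity, usage
+ * and failed-volume counts for this DataNode.
+ */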
+ HeartbeatResponse sendHeartBeat() throws IOException {
+ LOG.info("heartbeat: " + this);
+ // reports number of failed volumes
+ StorageReport[] report = { new StorageReport(bpRegistration.getStorageID(),
+ false,
+ dn.getFSDataset().getCapacity(),
+ dn.getFSDataset().getDfsUsed(),
+ dn.getFSDataset().getRemaining(),
+ dn.getFSDataset().getBlockPoolUsed(bpos.getBlockPoolId())) };
+ return bpNamenode.sendHeartbeat(bpRegistration, report,
+ dn.getXmitsInProgress(),
+ dn.getXceiverCount(),
+ dn.getFSDataset().getNumFailedVolumes());
+ }
+
+ //This must be called only by BPOfferService
+ void start() {
+ if ((bpThread != null) && (bpThread.isAlive())) {
+ //Thread is started already
+ return;
+ }
+ bpThread = new Thread(this, formatThreadName());
+ bpThread.setDaemon(true); // needed for JUnit testing
+ bpThread.start();
+ }
+
+ private String formatThreadName() {
+ Collection<URI> dataDirs = DataNode.getStorageDirs(dn.getConf());
+ return "DataNode: [" +
+ StringUtils.uriToString(dataDirs.toArray(new URI[0])) + "] " +
+ " heartbeating to " + nnAddr;
+ }
+
+ //This must be called only by blockPoolManager.
+ void stop() {
+ shouldServiceRun = false;
+ if (bpThread != null) {
+ bpThread.interrupt();
+ }
+ }
+
+ //This must be called only by blockPoolManager
+ void join() {
+ try {
+ if (bpThread != null) {
+ bpThread.join();
+ }
+ } catch (InterruptedException ie) { }
+ }
+
+ //Cleanup method to be called by current thread before exiting.
+ private synchronized void cleanUp() {
+
+ shouldServiceRun = false;
+ IOUtils.cleanup(LOG, bpNamenode);
+ bpos.shutdownActor(this);
+ }
+
+ /**
+ * Main loop for each BP thread. Run until shutdown,
+ * forever calling remote NameNode functions.
+ */
+ private void offerService() throws Exception {
+ LOG.info("For namenode " + nnAddr + " using DELETEREPORT_INTERVAL of "
+ + dnConf.deleteReportInterval + " msec " + " BLOCKREPORT_INTERVAL of "
+ + dnConf.blockReportInterval + "msec" + " Initial delay: "
+ + dnConf.initialBlockReportDelay + "msec" + "; heartBeatInterval="
+ + dnConf.heartBeatInterval);
+
+ //
+ // Now loop for a long time....
+ //
+ while (shouldRun()) {
+ try {
+ long startTime = now();
+
+ //
+ // Every so often, send heartbeat or block-report
+ //
+ if (startTime - lastHeartbeat > dnConf.heartBeatInterval) {
+ //
+ // All heartbeat messages include the following info:
+ // -- Datanode name
+ // -- data transfer port
+ // -- Total capacity
+ // -- Bytes remaining
+ //
+ lastHeartbeat = startTime;
+ if (!dn.areHeartbeatsDisabledForTests()) {
+ HeartbeatResponse resp = sendHeartBeat();
+ assert resp != null;
+ dn.getMetrics().addHeartbeat(now() - startTime);
+
+ // If the state of this NN has changed (eg STANDBY->ACTIVE)
+ // then let the BPOfferService update itself.
+ //
+ // Important that this happens before processCommand below,
+ // since the first heartbeat to a new active might have commands
+ // that we should actually process.
+ bpos.updateActorStatesFromHeartbeat(
+ this, resp.getNameNodeHaState());
+
+ long startProcessCommands = now();
+ if (!processCommand(resp.getCommands()))
+ continue;
+ long endProcessCommands = now();
+ if (endProcessCommands - startProcessCommands > 2000) {
+ LOG.info("Took " + (endProcessCommands - startProcessCommands)
+ + "ms to process " + resp.getCommands().length
+ + " commands from NN");
+ }
+ }
+ }
+ if (pendingReceivedRequests > 0
+ || (startTime - lastDeletedReport > dnConf.deleteReportInterval)) {
+ reportReceivedDeletedBlocks();
+ lastDeletedReport = startTime;
+ }
+
+ DatanodeCommand cmd = blockReport();
+ processCommand(new DatanodeCommand[]{ cmd });
+
+ // Now safe to start scanning the block pool.
+ // If it has already been started, this is a no-op.
+ if (dn.blockScanner != null) {
+ dn.blockScanner.addBlockPool(bpos.getBlockPoolId());
+ }
+
+ //
+ // There is no work to do; sleep until the heartbeat timer elapses,
+ // or work arrives, and then iterate again.
+ //
+ long waitTime = dnConf.heartBeatInterval -
+ (System.currentTimeMillis() - lastHeartbeat);
+ synchronized(pendingIncrementalBR) {
+ if (waitTime > 0 && pendingReceivedRequests == 0) {
+ try {
+ pendingIncrementalBR.wait(waitTime);
+ } catch (InterruptedException ie) {
+ LOG.warn("BPOfferService for " + this + " interrupted");
+ }
+ }
+ } // synchronized
+ } catch(RemoteException re) {
+ String reClass = re.getClassName();
+ if (UnregisteredNodeException.class.getName().equals(reClass) ||
+ DisallowedDatanodeException.class.getName().equals(reClass) ||
+ IncorrectVersionException.class.getName().equals(reClass)) {
+ LOG.warn(this + " is shutting down", re);
+ shouldServiceRun = false;
+ return;
+ }
+ LOG.warn("RemoteException in offerService", re);
+ try {
+ long sleepTime = Math.min(1000, dnConf.heartBeatInterval);
+ Thread.sleep(sleepTime);
+ } catch (InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ }
+ } catch (IOException e) {
+ LOG.warn("IOException in offerService", e);
+ }
+ } // while (shouldRun())
+ } // offerService
+
+ /**
+ * Register one bp with the corresponding NameNode
+ * <p>
+ * The datanode needs to register with the namenode on startup in order
+ * 1) to report which storage it is serving now and
+ * 2) to receive a registrationID issued by the namenode to recognize
+ * registered datanodes.
+ *
+ * @see FSNamesystem#registerDatanode(DatanodeRegistration)
+ * @throws IOException
+ */
+ void register() throws IOException {
+ // The handshake() phase loaded the block pool storage
+ // off disk - so update the bpRegistration object from that info
+ bpRegistration = bpos.createRegistration();
+
+ LOG.info(this + " beginning handshake with NN");
+
+ while (shouldRun()) {
+ try {
+ // Use returned registration from namenode with updated machine name.
+ bpRegistration = bpNamenode.registerDatanode(bpRegistration,
+ new DatanodeStorage[0]);
+ break;
+ } catch(SocketTimeoutException e) { // namenode is busy
+ LOG.info("Problem connecting to server: " + nnAddr);
+ sleepAndLogInterrupts(1000, "connecting to server");
+ }
+ }
+
+ LOG.info("Block pool " + this + " successfully registered with NN");
+ bpos.registrationSucceeded(this, bpRegistration);
+
+ // random short delay - helps scatter the BR from all DNs
+ scheduleBlockReport(dnConf.initialBlockReportDelay);
+ }
+
+
+ private void sleepAndLogInterrupts(int millis,
+ String stateString) {
+ try {
+ Thread.sleep(millis);
+ } catch (InterruptedException ie) {
+ LOG.info("BPOfferService " + this +
+ " interrupted while " + stateString);
+ }
+ }
+
+ /**
+ * No matter what kind of exception we get, keep retrying offerService().
+ * That's the loop that connects to the NameNode and provides basic DataNode
+ * functionality.
+ *
+ * Only stop when "shouldRun" or "shouldServiceRun" is turned off, which can
+ * happen either at shutdown or due to refreshNamenodes.
+ */
+ @Override
+ public void run() {
+ LOG.info(this + " starting to offer service");
+
+ try {
+ // init stuff
+ try {
+ // setup storage
+ connectToNNAndHandshake();
+ } catch (IOException ioe) {
+ // Initial handshake, storage recovery or registration failed
+ // End BPOfferService thread
+ LOG.fatal("Initialization failed for block pool " + this, ioe);
+ return;
+ }
+
+ initialized = true; // bp is initialized;
+
+ while (shouldRun()) {
+ try {
+ bpos.startDistributedUpgradeIfNeeded();
+ offerService();
+ } catch (Exception ex) {
+ LOG.error("Exception in BPOfferService for " + this, ex);
+ sleepAndLogInterrupts(5000, "offering service");
+ }
+ }
+ } catch (Throwable ex) {
+ LOG.warn("Unexpected exception in block pool " + this, ex);
+ } finally {
+ LOG.warn("Ending block pool service for: " + this);
+ cleanUp();
+ }
+ }
+
+ private boolean shouldRun() {
+ return shouldServiceRun && dn.shouldRun();
+ }
+
+ /**
+ * Process an array of datanode commands
+ *
+ * @param cmds an array of datanode commands
+ * @return true if further processing may be required or false otherwise.
+ */
+ boolean processCommand(DatanodeCommand[] cmds) {
+ if (cmds != null) {
+ for (DatanodeCommand cmd : cmds) {
+ try {
+ if (!bpos.processCommandFromActor(cmd, this)) {
+ return false;
+ }
+ } catch (IOException ioe) {
+ LOG.warn("Error processing datanode Command", ioe);
+ }
+ }
+ }
+ return true;
+ }
+
+ void trySendErrorReport(int errCode, String errMsg) {
+ try {
+ bpNamenode.errorReport(bpRegistration, errCode, errMsg);
+ } catch(IOException e) {
+ LOG.warn("Error reporting an error to NameNode " + nnAddr,
+ e);
+ }
+ }
+
+ /**
+ * Report a bad block from another DN in this cluster.
+ */
+ void reportRemoteBadBlock(DatanodeInfo dnInfo, ExtendedBlock block)
+ throws IOException {
+ LocatedBlock lb = new LocatedBlock(block,
+ new DatanodeInfo[] {dnInfo});
+ bpNamenode.reportBadBlocks(new LocatedBlock[] {lb});
+ }
+
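+ /**
+ * Re-register with the NameNode, refreshing the namespace info first in
+ * case the NN was restarted with a different version.
+ */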
+ void reRegister() throws IOException {
+ if (shouldRun()) {
+ // re-retrieve namespace info to make sure that, if the NN
+ // was restarted, we still match its version (HDFS-2120)
+ retrieveNamespaceInfo();
+ // and re-register
+ register();
+ }
+ }
+
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java
new file mode 100644
index 0000000..3355ee2
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolManager.java
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.datanode;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.security.UserGroupInformation;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+/**
+ * Manages the BPOfferService objects for the data node.
+ * Creation, removal, starting, stopping, shutdown on BPOfferService
+ * objects must be done via APIs in this class.
+ */
+@InterfaceAudience.Private
+class BlockPoolManager {
+ private static final Log LOG = DataNode.LOG;
+
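+ // BPOfferService instances are tracked by nameservice ID and, once a
+ // block pool is registered, by block pool ID; offerServices lists them all.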
+ private final Map<String, BPOfferService> bpByNameserviceId =
+ Maps.newHashMap();
+ private final Map<String, BPOfferService> bpByBlockPoolId =
+ Maps.newHashMap();
+ private final List<BPOfferService> offerServices =
+ Lists.newArrayList();
+
+ private final DataNode dn;
+
+ //This lock is used only to ensure exclusion of refreshNamenodes
+ private final Object refreshNamenodesLock = new Object();
+
+ BlockPoolManager(DataNode dn) {
+ this.dn = dn;
+ }
+
+ synchronized void addBlockPool(BPOfferService bpos) {
+ Preconditions.checkArgument(offerServices.contains(bpos),
+ "Unknown BPOS: %s", bpos);
+ if (bpos.getBlockPoolId() == null) {
+ throw new IllegalArgumentException("Null blockpool id");
+ }
+ bpByBlockPoolId.put(bpos.getBlockPoolId(), bpos);
+ }
+
+ /**
+ * Returns the array of BPOfferService objects.
+ * Caution: The BPOfferService objects returned could be shut down at any time.
+ */
+ synchronized BPOfferService[] getAllNamenodeThreads() {
+ BPOfferService[] bposArray = new BPOfferService[offerServices.size()];
+ return offerServices.toArray(bposArray);
+ }
+
+ synchronized BPOfferService get(String bpid) {
+ return bpByBlockPoolId.get(bpid);
+ }
+
+ synchronized void remove(BPOfferService t) {
+ offerServices.remove(t);
+ bpByBlockPoolId.remove(t.getBlockPoolId());
+
+ boolean removed = false;
+ for (Iterator<BPOfferService> it = bpByNameserviceId.values().iterator();
+ it.hasNext() && !removed;) {
+ BPOfferService bpos = it.next();
+ if (bpos == t) {
+ it.remove();
+ LOG.info("Removed " + bpos);
+ removed = true;
+ }
+ }
+
+ if (!removed) {
+ LOG.warn("Couldn't remove BPOS " + t + " from bpByNameserviceId map");
+ }
+ }
+
+ void shutDownAll() throws InterruptedException {
+ BPOfferService[] bposArray = this.getAllNamenodeThreads();
+
+ for (BPOfferService bpos : bposArray) {
+ bpos.stop(); //interrupts the threads
+ }
+ //now join
+ for (BPOfferService bpos : bposArray) {
+ bpos.join();
+ }
+ }
+
+ synchronized void startAll() throws IOException {
+ try {
+ UserGroupInformation.getLoginUser().doAs(
+ new PrivilegedExceptionAction<Object>() {
+ public Object run() throws Exception {
+ for (BPOfferService bpos : offerServices) {
+ bpos.start();
+ }
+ return null;
+ }
+ });
+ } catch (InterruptedException ex) {
+ IOException ioe = new IOException();
+ ioe.initCause(ex.getCause());
+ throw ioe;
+ }
+ }
+
+ void joinAll() {
+ for (BPOfferService bpos: this.getAllNamenodeThreads()) {
+ bpos.join();
+ }
+ }
+
+ void refreshNamenodes(Configuration conf)
+ throws IOException {
+ LOG.info("Refresh request received for nameservices: "
+ + conf.get(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES));
+
+ Map<String, Map<String, InetSocketAddress>> newAddressMap =
+ DFSUtil.getNNServiceRpcAddresses(conf);
+
+ synchronized (refreshNamenodesLock) {
+ doRefreshNamenodes(newAddressMap);
+ }
+ }
+
+ private void doRefreshNamenodes(
+ Map<String, Map<String, InetSocketAddress>> addrMap) throws IOException {
+ assert Thread.holdsLock(refreshNamenodesLock);
+
+ Set<String> toRefresh = Sets.newHashSet();
+ Set<String> toAdd = Sets.newHashSet();
+ Set<String> toRemove;
+
+ synchronized (this) {
+ // Step 1. For each of the new nameservices, figure out whether
+ // it's an update of the set of NNs for an existing NS,
+ // or an entirely new nameservice.
+ for (String nameserviceId : addrMap.keySet()) {
+ if (bpByNameserviceId.containsKey(nameserviceId)) {
+ toRefresh.add(nameserviceId);
+ } else {
+ toAdd.add(nameserviceId);
+ }
+ }
+
+ // Step 2. Any nameservices we currently have but are no longer present
+ // need to be removed.
+ toRemove = Sets.newHashSet(Sets.difference(
+ bpByNameserviceId.keySet(), addrMap.keySet()));
+
+ assert toRefresh.size() + toAdd.size() ==
+ addrMap.size() :
+ "toAdd: " + Joiner.on(",").useForNull("<default>").join(toAdd) +
+ " toRemove: " + Joiner.on(",").useForNull("<default>").join(toRemove) +
+ " toRefresh: " + Joiner.on(",").useForNull("<default>").join(toRefresh);
+
+
+ // Step 3. Start new nameservices
+ if (!toAdd.isEmpty()) {
+ LOG.info("Starting BPOfferServices for nameservices: " +
+ Joiner.on(",").useForNull("<default>").join(toAdd));
+
+ for (String nsToAdd : toAdd) {
+ ArrayList<InetSocketAddress> addrs =
+ Lists.newArrayList(addrMap.get(nsToAdd).values());
+ BPOfferService bpos = createBPOS(addrs);
+ bpByNameserviceId.put(nsToAdd, bpos);
+ offerServices.add(bpos);
+ }
+ }
+ startAll();
+ }
+
+ // Step 4. Shut down old nameservices. This happens outside
+ // of the synchronized(this) lock since they need to call
+ // back to .remove() from another thread
+ if (!toRemove.isEmpty()) {
+ LOG.info("Stopping BPOfferServices for nameservices: " +
+ Joiner.on(",").useForNull("<default>").join(toRemove));
+
+ for (String nsToRemove : toRemove) {
+ BPOfferService bpos = bpByNameserviceId.get(nsToRemove);
+ bpos.stop();
+ bpos.join();
+ // they will call remove on their own
+ }
+ }
+
+ // Step 5. Update nameservices whose NN list has changed
+ if (!toRefresh.isEmpty()) {
+ LOG.info("Refreshing list of NNs for nameservices: " +
+ Joiner.on(",").useForNull("<default>").join(toRefresh));
+
+ for (String nsToRefresh : toRefresh) {
+ BPOfferService bpos = bpByNameserviceId.get(nsToRefresh);
+ ArrayList<InetSocketAddress> addrs =
+ Lists.newArrayList(addrMap.get(nsToRefresh).values());
+ bpos.refreshNNList(addrs);
+ }
+ }
+ }
+
+ /**
+ * Extracted out for test purposes.
+ */
+ protected BPOfferService createBPOS(List<InetSocketAddress> nnAddrs) {
+ return new BPOfferService(nnAddrs, dn);
+ }
+}
\ No newline at end of file
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
index 153fd93..fd25c1d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
@@ -153,6 +153,7 @@
switch (stage) {
case PIPELINE_SETUP_CREATE:
replicaInfo = datanode.data.createRbw(block);
+ datanode.notifyNamenodeReceivingBlock(block);
break;
case PIPELINE_SETUP_STREAMING_RECOVERY:
replicaInfo = datanode.data.recoverRbw(
@@ -166,6 +167,7 @@
block.getLocalBlock());
}
block.setGenerationStamp(newGs);
+ datanode.notifyNamenodeReceivingBlock(block);
break;
case PIPELINE_SETUP_APPEND_RECOVERY:
replicaInfo = datanode.data.recoverAppend(block, newGs, minBytesRcvd);
@@ -174,6 +176,7 @@
block.getLocalBlock());
}
block.setGenerationStamp(newGs);
+ datanode.notifyNamenodeReceivingBlock(block);
break;
case TRANSFER_RBW:
case TRANSFER_FINALIZED:
@@ -320,7 +323,6 @@
private void verifyChunks( byte[] dataBuf, int dataOff, int len,
byte[] checksumBuf, int checksumOff )
throws IOException {
- DatanodeProtocol nn = datanode.getBPNamenode(block.getBlockPoolId());
while (len > 0) {
int chunkLen = Math.min(len, bytesPerChecksum);
@@ -331,9 +333,7 @@
try {
LOG.info("report corrupt block " + block + " from datanode " +
srcDataNode + " to namenode");
- LocatedBlock lb = new LocatedBlock(block,
- new DatanodeInfo[] {srcDataNode});
- nn.reportBadBlocks(new LocatedBlock[] {lb});
+ datanode.reportRemoteBadBlock(srcDataNode, block);
} catch (IOException e) {
LOG.warn("Failed to report bad block " + block +
" from datanode " + srcDataNode + " to namenode");
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
index 5681525..098809c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
@@ -45,7 +45,6 @@
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_STARTUP_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_USER_NAME_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_FEDERATION_NAMESERVICES;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HTTPS_ENABLE_KEY;
import java.io.BufferedOutputStream;
@@ -86,6 +85,7 @@
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress;
import org.apache.hadoop.hdfs.HDFSPolicyProvider;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.protocol.Block;
@@ -164,6 +164,8 @@
import org.apache.hadoop.util.VersionInfo;
import org.mortbay.util.ajax.JSON;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Sets;
import com.google.protobuf.BlockingService;
@@ -230,143 +232,6 @@
return NetUtils.createSocketAddr(target);
}
- /**
- * Manages he BPOfferService objects for the data node.
- * Creation, removal, starting, stopping, shutdown on BPOfferService
- * objects must be done via APIs in this class.
- */
- @InterfaceAudience.Private
- class BlockPoolManager {
- private final Map<String, BPOfferService> bpMapping;
- private final Map<InetSocketAddress, BPOfferService> nameNodeThreads;
-
- //This lock is used only to ensure exclusion of refreshNamenodes
- private final Object refreshNamenodesLock = new Object();
-
- BlockPoolManager(Configuration conf)
- throws IOException {
- bpMapping = new HashMap<String, BPOfferService>();
- nameNodeThreads = new HashMap<InetSocketAddress, BPOfferService>();
-
- List<InetSocketAddress> isas = DFSUtil.getNNServiceRpcAddresses(conf);
- for(InetSocketAddress isa : isas) {
- BPOfferService bpos = new BPOfferService(isa, DataNode.this);
- nameNodeThreads.put(bpos.getNNSocketAddress(), bpos);
- }
- }
-
- synchronized void addBlockPool(BPOfferService t) {
- if (nameNodeThreads.get(t.getNNSocketAddress()) == null) {
- throw new IllegalArgumentException(
- "Unknown BPOfferService thread for namenode address:"
- + t.getNNSocketAddress());
- }
- if (t.getBlockPoolId() == null) {
- throw new IllegalArgumentException("Null blockpool id");
- }
- bpMapping.put(t.getBlockPoolId(), t);
- }
-
- /**
- * Returns the array of BPOfferService objects.
- * Caution: The BPOfferService returned could be shutdown any time.
- */
- synchronized BPOfferService[] getAllNamenodeThreads() {
- BPOfferService[] bposArray = new BPOfferService[nameNodeThreads.values()
- .size()];
- return nameNodeThreads.values().toArray(bposArray);
- }
-
- synchronized BPOfferService get(InetSocketAddress addr) {
- return nameNodeThreads.get(addr);
- }
-
- synchronized BPOfferService get(String bpid) {
- return bpMapping.get(bpid);
- }
-
- synchronized void remove(BPOfferService t) {
- nameNodeThreads.remove(t.getNNSocketAddress());
- bpMapping.remove(t.getBlockPoolId());
- }
-
- void shutDownAll() throws InterruptedException {
- BPOfferService[] bposArray = this.getAllNamenodeThreads();
-
- for (BPOfferService bpos : bposArray) {
- bpos.stop(); //interrupts the threads
- }
- //now join
- for (BPOfferService bpos : bposArray) {
- bpos.join();
- }
- }
-
- synchronized void startAll() throws IOException {
- try {
- UserGroupInformation.getLoginUser().doAs(
- new PrivilegedExceptionAction<Object>() {
- public Object run() throws Exception {
- for (BPOfferService bpos : nameNodeThreads.values()) {
- bpos.start();
- }
- return null;
- }
- });
- } catch (InterruptedException ex) {
- IOException ioe = new IOException();
- ioe.initCause(ex.getCause());
- throw ioe;
- }
- }
-
- void joinAll() {
- for (BPOfferService bpos: this.getAllNamenodeThreads()) {
- bpos.join();
- }
- }
-
- void refreshNamenodes(Configuration conf)
- throws IOException {
- LOG.info("Refresh request received for nameservices: "
- + conf.get(DFS_FEDERATION_NAMESERVICES));
- List<InetSocketAddress> newAddresses =
- DFSUtil.getNNServiceRpcAddresses(conf);
- List<BPOfferService> toShutdown = new ArrayList<BPOfferService>();
- List<InetSocketAddress> toStart = new ArrayList<InetSocketAddress>();
- synchronized (refreshNamenodesLock) {
- synchronized (this) {
- for (InetSocketAddress nnaddr : nameNodeThreads.keySet()) {
- if (!(newAddresses.contains(nnaddr))) {
- toShutdown.add(nameNodeThreads.get(nnaddr));
- }
- }
- for (InetSocketAddress nnaddr : newAddresses) {
- if (!(nameNodeThreads.containsKey(nnaddr))) {
- toStart.add(nnaddr);
- }
- }
-
- for (InetSocketAddress nnaddr : toStart) {
- BPOfferService bpos = new BPOfferService(nnaddr, DataNode.this);
- nameNodeThreads.put(bpos.getNNSocketAddress(), bpos);
- }
- }
-
- for (BPOfferService bpos : toShutdown) {
- bpos.stop();
- bpos.join();
- }
-
- // stoping the BPOSes causes them to call remove() on their own when they
- // clean up.
-
- // Now start the threads that are not already running.
- startAll();
- }
- }
- }
-
volatile boolean shouldRun = true;
private BlockPoolManager blockPoolManager;
public volatile FSDatasetInterface<? extends FSVolumeInterface> data = null;
@@ -653,7 +518,18 @@
if(bpos != null) {
bpos.notifyNamenodeReceivedBlock(block, delHint);
} else {
- LOG.warn("Cannot find BPOfferService for reporting block received for bpid="
+ LOG.error("Cannot find BPOfferService for reporting block received for bpid="
+ + block.getBlockPoolId());
+ }
+ }
+
+ // calls specific to BP
+ protected void notifyNamenodeReceivingBlock(ExtendedBlock block) {
+ BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId());
+ if(bpos != null) {
+ bpos.notifyNamenodeReceivingBlock(block);
+ } else {
+ LOG.error("Cannot find BPOfferService for reporting block receiving for bpid="
+ block.getBlockPoolId());
}
}
@@ -664,18 +540,66 @@
if (bpos != null) {
bpos.notifyNamenodeDeletedBlock(block);
} else {
- LOG.warn("Cannot find BPOfferService for reporting block deleted for bpid="
+ LOG.error("Cannot find BPOfferService for reporting block deleted for bpid="
+ block.getBlockPoolId());
}
}
+ /**
+ * Report a bad block which is hosted on the local DN.
+ */
public void reportBadBlocks(ExtendedBlock block) throws IOException{
- BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId());
- if(bpos == null || bpos.bpNamenode == null) {
- throw new IOException("cannot locate OfferService thread for bp="+block.getBlockPoolId());
- }
+ BPOfferService bpos = getBPOSForBlock(block);
bpos.reportBadBlocks(block);
}
+
+ /**
+ * Report a bad block on another DN (eg if we received a corrupt replica
+ * from a remote host).
+ * @param srcDataNode the DN hosting the bad block
+ * @param block the block itself
+ */
+ public void reportRemoteBadBlock(DatanodeInfo srcDataNode, ExtendedBlock block)
+ throws IOException {
+ BPOfferService bpos = getBPOSForBlock(block);
+ bpos.reportRemoteBadBlock(srcDataNode, block);
+ }
+
+ /**
+ * Try to send an error report to the NNs associated with the given
+ * block pool.
+ * @param bpid the block pool ID
+ * @param errCode error code to send
+ * @param errMsg textual message to send
+ */
+ void trySendErrorReport(String bpid, int errCode, String errMsg) {
+ BPOfferService bpos = blockPoolManager.get(bpid);
+ if (bpos == null) {
+ throw new IllegalArgumentException("Bad block pool: " + bpid);
+ }
+ bpos.trySendErrorReport(errCode, errMsg);
+ }
+
+
+
+ /**
+ * Return the BPOfferService instance corresponding to the given block.
+ * @param block
+ * @return the BPOS
+ * @throws IOException if no such BPOS can be found
+ */
+ private BPOfferService getBPOSForBlock(ExtendedBlock block)
+ throws IOException {
+ Preconditions.checkNotNull(block);
+ BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId());
+ if (bpos == null) {
+ throw new IOException("cannot locate OfferService thread for bp="+
+ block.getBlockPoolId());
+ }
+ return bpos;
+ }
+
+
// used only for testing
void setHeartbeatsDisabledForTests(
@@ -728,7 +652,8 @@
metrics = DataNodeMetrics.create(conf, getMachineName());
- blockPoolManager = new BlockPoolManager(conf);
+ blockPoolManager = new BlockPoolManager(this);
+ blockPoolManager.refreshNamenodes(conf);
}
/**
@@ -961,11 +886,15 @@
/**
* get BP registration by machine and port name (host:port)
- * @param mName
+ * @param mName - the name that the NN used
* @return BP registration
* @throws IOException
*/
DatanodeRegistration getDNRegistrationByMachineName(String mName) {
+ // TODO: all the BPs should have the same name as each other; they all come
+ // from getName() here, and the only use cases are in tests where they just
+ // call this with getName(). So we could probably just make this method return
+ // the first BPOS's registration. See HDFS-2609.
BPOfferService [] bposArray = blockPoolManager.getAllNamenodeThreads();
for (BPOfferService bpos : bposArray) {
if(bpos.bpRegistration.getName().equals(mName))
@@ -1011,20 +940,6 @@
throw new IOException(ie.getMessage());
}
}
-
- /**
- * get the name node address based on the block pool id
- * @param bpid block pool ID
- * @return namenode address corresponding to the bpid
- */
- public InetSocketAddress getNameNodeAddr(String bpid) {
- BPOfferService bp = blockPoolManager.get(bpid);
- if (bp != null) {
- return bp.getNNSocketAddress();
- }
- LOG.warn("No name node address found for block pool ID " + bpid);
- return null;
- }
public InetSocketAddress getSelfAddr() {
return selfAddr;
@@ -1251,12 +1166,7 @@
//inform NameNodes
for(BPOfferService bpos: blockPoolManager.getAllNamenodeThreads()) {
- DatanodeProtocolClientSideTranslatorPB nn = bpos.bpNamenode;
- try {
- nn.errorReport(bpos.bpRegistration, dpError, errMsgr);
- } catch(IOException e) {
- LOG.warn("Error reporting disk failure to NameNode", e);
- }
+ bpos.trySendErrorReport(dpError, errMsgr);
}
if(hasEnoughResources) {
@@ -1273,6 +1183,10 @@
public int getXceiverCount() {
return threadGroup == null ? 0 : threadGroup.activeCount();
}
+
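+ /** @return the number of block transfers currently in progress on this DN. */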
+ int getXmitsInProgress() {
+ return xmitsInProgress.get();
+ }
UpgradeManagerDatanode getUpgradeManagerDatanode(String bpid) {
BPOfferService bpos = blockPoolManager.get(bpid);
@@ -1285,15 +1199,15 @@
private void transferBlock( ExtendedBlock block,
DatanodeInfo xferTargets[]
) throws IOException {
- DatanodeProtocolClientSideTranslatorPB nn = getBPNamenode(block
- .getBlockPoolId());
+ BPOfferService bpos = getBPOSForBlock(block);
DatanodeRegistration bpReg = getDNRegistrationForBP(block.getBlockPoolId());
if (!data.isValidBlock(block)) {
// block does not exist or is under-construction
String errStr = "Can't send invalid block " + block;
LOG.info(errStr);
- nn.errorReport(bpReg, DatanodeProtocol.INVALID_BLOCK, errStr);
+
+ bpos.trySendErrorReport(DatanodeProtocol.INVALID_BLOCK, errStr);
return;
}
@@ -1301,9 +1215,7 @@
long onDiskLength = data.getLength(block);
if (block.getNumBytes() > onDiskLength) {
// Shorter on-disk len indicates corruption so report NN the corrupt block
- nn.reportBadBlocks(new LocatedBlock[]{
- new LocatedBlock(block, new DatanodeInfo[] {
- new DatanodeInfo(bpReg)})});
+ bpos.reportBadBlocks(block);
LOG.warn("Can't replicate block " + block
+ " because on-disk length " + onDiskLength
+ " is shorter than NameNode recorded length " + block.getNumBytes());
@@ -1861,6 +1773,13 @@
long newLength) throws IOException {
ReplicaInfo r = data.updateReplicaUnderRecovery(oldBlock,
recoveryId, newLength);
+ // Notify the namenode of the updated block info. This is important
+ // for HA, since otherwise the standby node may lose track of the
+ // block locations until the next block report.
+ ExtendedBlock newBlock = new ExtendedBlock(oldBlock);
+ newBlock.setGenerationStamp(recoveryId);
+ newBlock.setNumBytes(newLength);
+ notifyNamenodeReceivedBlock(newBlock, "");
return new ExtendedBlock(oldBlock.getBlockPoolId(), r);
}
@@ -1935,23 +1854,32 @@
* @return Namenode corresponding to the bpid
* @throws IOException
*/
- public DatanodeProtocolClientSideTranslatorPB getBPNamenode(String bpid)
+ public DatanodeProtocolClientSideTranslatorPB getActiveNamenodeForBP(String bpid)
throws IOException {
BPOfferService bpos = blockPoolManager.get(bpid);
if (bpos == null) {
throw new IOException("No block pool offer service for bpid=" + bpid);
- } else if (bpos.bpNamenode == null) {
- throw new IOException("cannot find a namenode proxy for bpid=" + bpid);
}
- return bpos.bpNamenode;
+
+ DatanodeProtocolClientSideTranslatorPB activeNN = bpos.getActiveNN();
+ if (activeNN == null) {
+ throw new IOException(
+ "Block pool " + bpid + " has not recognized an active NN");
+ }
+ return activeNN;
}
/** Block synchronization */
void syncBlock(RecoveringBlock rBlock,
List<BlockRecord> syncList) throws IOException {
ExtendedBlock block = rBlock.getBlock();
- DatanodeProtocolClientSideTranslatorPB nn = getBPNamenode(block
- .getBlockPoolId());
+ DatanodeProtocolClientSideTranslatorPB nn =
+ getActiveNamenodeForBP(block.getBlockPoolId());
+ if (nn == null) {
+ throw new IOException(
+ "Unable to synchronize block " + rBlock + ", since this DN "
+ + " has not acknowledged any NN as active.");
+ }
long recoveryId = rBlock.getNewGenerationStamp();
if (LOG.isDebugEnabled()) {
@@ -2172,14 +2100,19 @@
/**
* Returned information is a JSON representation of a map with
- * name node host name as the key and block pool Id as the value
+ * name node host name as the key and block pool Id as the value.
+ * Note that, if there are multiple NNs in an HA nameservice,
+ * a given block pool may be represented twice.
*/
@Override // DataNodeMXBean
public String getNamenodeAddresses() {
final Map<String, String> info = new HashMap<String, String>();
for (BPOfferService bpos : blockPoolManager.getAllNamenodeThreads()) {
- if (bpos != null && bpos.bpThread != null) {
- info.put(bpos.getNNSocketAddress().getHostName(), bpos.getBlockPoolId());
+ if (bpos != null) {
+ for (BPServiceActor actor : bpos.getBPServiceActors()) {
+ info.put(actor.getNNSocketAddress().getHostName(),
+ bpos.getBlockPoolId());
+ }
}
}
return JSON.toString(info);
@@ -2228,11 +2161,18 @@
/**
* @param addr rpc address of the namenode
- * @return true - if BPOfferService corresponding to the namenode is alive
+ * @return true if the datanode is connected to a NameNode at the
+ * given address
*/
- public boolean isBPServiceAlive(InetSocketAddress addr) {
- BPOfferService bp = blockPoolManager.get(addr);
- return bp != null ? bp.isAlive() : false;
+ public boolean isConnectedToNN(InetSocketAddress addr) {
+ for (BPOfferService bpos : getAllBpOs()) {
+ for (BPServiceActor bpsa : bpos.getBPServiceActors()) {
+ if (addr.equals(bpsa.getNNSocketAddress())) {
+ return bpsa.isAlive();
+ }
+ }
+ }
+ return false;
}
/**
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FSDatasetAsyncDiskService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FSDatasetAsyncDiskService.java
index 408a6af..89272b2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FSDatasetAsyncDiskService.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FSDatasetAsyncDiskService.java
@@ -107,6 +107,14 @@
}
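+ /** Count deletion tasks submitted to the volume executors but not yet completed. */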
+ synchronized long countPendingDeletions() {
+ long count = 0;
+ for (ThreadPoolExecutor exec : executors.values()) {
+ count += exec.getTaskCount() - exec.getCompletedTaskCount();
+ }
+ return count;
+ }
+
/**
* Execute the task sometime in the future, using ThreadPools.
*/
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeManagerDatanode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeManagerDatanode.java
index 478fb56..9ada40fd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeManagerDatanode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeManagerDatanode.java
@@ -92,7 +92,7 @@
"UpgradeManagerDatanode.currentUpgrades is not null.";
assert upgradeDaemon == null :
"UpgradeManagerDatanode.upgradeDaemon is not null.";
- DatanodeProtocol nn = dataNode.getBPNamenode(bpid);
+ DatanodeProtocol nn = dataNode.getActiveNamenodeForBP(bpid);
nn.processUpgradeCommand(broadcastCommand);
return true;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeObjectDatanode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeObjectDatanode.java
index ddb1d60..49d2621 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeObjectDatanode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/UpgradeObjectDatanode.java
@@ -45,7 +45,7 @@
}
protected DatanodeProtocol getNamenode() throws IOException {
- return dataNode.getBPNamenode(bpid);
+ return dataNode.getActiveNamenodeForBP(bpid);
}
void setDatanode(DataNode dataNode, String bpid) {
@@ -92,14 +92,7 @@
+ " Name-node version = " + nsInfo.getLayoutVersion() + ".";
DataNode.LOG.fatal( errorMsg );
String bpid = nsInfo.getBlockPoolID();
- DatanodeProtocol nn = dataNode.getBPNamenode(bpid);
- try {
- nn.errorReport(dataNode.getDNRegistrationForBP(bpid),
- DatanodeProtocol.NOTIFY, errorMsg);
- } catch(SocketTimeoutException e) { // namenode is busy
- DataNode.LOG.info("Problem connecting to server: "
- + dataNode.getNameNodeAddr(nsInfo.getBlockPoolID()));
- }
+ dataNode.trySendErrorReport(bpid, DatanodeProtocol.NOTIFY, errorMsg);
throw new IOException(errorMsg);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java
index fc1fe14..ece013f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java
@@ -217,7 +217,7 @@
int logVersion = storage.getLayoutVersion();
backupInputStream.setBytes(data, logVersion);
- int numLoaded = logLoader.loadEditRecords(logVersion, backupInputStream,
+ long numLoaded = logLoader.loadEditRecords(logVersion, backupInputStream,
true, lastAppliedTxId + 1);
if (numLoaded != numTxns) {
throw new IOException("Batch of txns starting at txnid " +
@@ -310,7 +310,7 @@
+ " txns from in-progress stream " + stream);
FSEditLogLoader loader = new FSEditLogLoader(namesystem);
- int numLoaded = loader.loadFSEdits(stream, lastAppliedTxId + 1);
+ long numLoaded = loader.loadFSEdits(stream, lastAppliedTxId + 1);
lastAppliedTxId += numLoaded;
assert numLoaded == remainingTxns :
"expected to load " + remainingTxns + " but loaded " +
@@ -345,7 +345,7 @@
synchronized void namenodeStartedLogSegment(long txid)
throws IOException {
LOG.info("NameNode started a new log segment at txid " + txid);
- if (editLog.isOpen()) {
+ if (editLog.isSegmentOpen()) {
if (editLog.getLastWrittenTxId() == txid - 1) {
// We are in sync with the NN, so end and finalize the current segment
editLog.endCurrentLogSegment(false);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java
index c655ee7..de75b76 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java
@@ -58,7 +58,7 @@
}
@Override
- public long getNumberOfTransactions(long fromTxnId)
+ public long getNumberOfTransactions(long fromTxnId, boolean inProgressOk)
throws IOException, CorruptionException {
// This JournalManager is never used for input. Therefore it cannot
// return any transactions
@@ -66,7 +66,8 @@
}
@Override
- public EditLogInputStream getInputStream(long fromTxnId) throws IOException {
+ public EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk)
+ throws IOException {
// This JournalManager is never used for input. Therefore it cannot
// return any transactions
throw new IOException("Unsupported operation");
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java
index 0515470..9cad4eb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java
@@ -26,13 +26,13 @@
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.protocol.proto.JournalProtocolProtos.JournalProtocolService;
import org.apache.hadoop.hdfs.protocolPB.JournalProtocolPB;
import org.apache.hadoop.hdfs.protocolPB.JournalProtocolServerSideTranslatorPB;
-import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
@@ -41,7 +41,8 @@
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
-import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.ipc.StandbyException;
+import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
@@ -69,7 +70,7 @@
private static final String BN_SERVICE_RPC_ADDRESS_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_SERVICE_RPC_ADDRESS_KEY;
/** Name-node proxy */
- NamenodeProtocolTranslatorPB namenode;
+ NamenodeProtocol namenode;
/** Name-node RPC address */
String nnRpcAddress;
/** Name-node HTTP address */
@@ -89,13 +90,13 @@
// Common NameNode methods implementation for backup node.
/////////////////////////////////////////////////////
@Override // NameNode
- protected InetSocketAddress getRpcServerAddress(Configuration conf) throws IOException {
+ protected InetSocketAddress getRpcServerAddress(Configuration conf) {
String addr = conf.get(BN_ADDRESS_NAME_KEY, BN_ADDRESS_DEFAULT);
return NetUtils.createSocketAddr(addr);
}
@Override
- protected InetSocketAddress getServiceRpcServerAddress(Configuration conf) throws IOException {
+ protected InetSocketAddress getServiceRpcServerAddress(Configuration conf) {
String addr = conf.get(BN_SERVICE_RPC_ADDRESS_KEY);
if (addr == null || addr.isEmpty()) {
return null;
@@ -143,6 +144,7 @@
CommonConfigurationKeys.FS_TRASH_INTERVAL_DEFAULT);
NamespaceInfo nsInfo = handshake(conf);
super.initialize(conf);
+
if (false == namesystem.isInSafeMode()) {
namesystem.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
}
@@ -189,7 +191,7 @@
}
// Stop the RPC client
if (namenode != null) {
- IOUtils.cleanup(LOG, namenode);
+ RPC.stopProxy(namenode);
}
namenode = null;
// Stop the checkpoint manager
@@ -197,6 +199,11 @@
checkpointManager.interrupt();
checkpointManager = null;
}
+
+ // Abort current log segment - otherwise the NN shutdown code
+ // will close it gracefully, which is incorrect.
+ getFSImage().getEditLog().abortCurrentLogSegment();
+
// Stop name-node threads
super.stop();
}
@@ -221,58 +228,31 @@
this.clientRpcServer);
nnRpcAddress = nn.nnRpcAddress;
}
-
- /////////////////////////////////////////////////////
- // NamenodeProtocol implementation for backup node.
- /////////////////////////////////////////////////////
- @Override // NamenodeProtocol
- public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size)
- throws IOException {
- throw new UnsupportedActionException("getBlocks");
- }
-
- // Only active name-node can register other nodes.
- @Override // NamenodeProtocol
- public NamenodeRegistration register(NamenodeRegistration registration
- ) throws IOException {
- throw new UnsupportedActionException("register");
- }
-
- @Override // NamenodeProtocol
- public NamenodeCommand startCheckpoint(NamenodeRegistration registration)
- throws IOException {
- throw new UnsupportedActionException("startCheckpoint");
- }
-
- @Override // NamenodeProtocol
- public void endCheckpoint(NamenodeRegistration registration,
- CheckpointSignature sig) throws IOException {
- throw new UnsupportedActionException("endCheckpoint");
- }
-
+
/////////////////////////////////////////////////////
// BackupNodeProtocol implementation for backup node.
/////////////////////////////////////////////////////
-
+ @Override
+ public void startLogSegment(NamenodeRegistration registration, long txid)
+ throws IOException {
+ namesystem.checkOperation(OperationCategory.JOURNAL);
+ verifyRequest(registration);
+
+ getBNImage().namenodeStartedLogSegment(txid);
+ }
+
@Override
public void journal(NamenodeRegistration nnReg,
long firstTxId, int numTxns,
byte[] records) throws IOException {
+ namesystem.checkOperation(OperationCategory.JOURNAL);
verifyRequest(nnReg);
if(!nnRpcAddress.equals(nnReg.getAddress()))
throw new IOException("Journal request from unexpected name-node: "
- + nnReg.getAddress() + " expecting " + clientRpcAddress);
+ + nnReg.getAddress() + " expecting " + nnRpcAddress);
getBNImage().journal(firstTxId, numTxns, records);
}
-
- @Override
- public void startLogSegment(NamenodeRegistration registration, long txid)
- throws IOException {
- verifyRequest(registration);
-
- getBNImage().namenodeStartedLogSegment(txid);
- }
-
+
private BackupImage getBNImage() {
return (BackupImage)nn.getFSImage();
}
@@ -295,8 +275,9 @@
private NamespaceInfo handshake(Configuration conf) throws IOException {
// connect to name node
InetSocketAddress nnAddress = NameNode.getServiceAddress(conf, true);
- this.namenode = new NamenodeProtocolTranslatorPB(nnAddress, conf,
- UserGroupInformation.getCurrentUser());
+ this.namenode = NameNodeProxies.createNonHAProxy(conf, nnAddress,
+ NamenodeProtocol.class, UserGroupInformation.getCurrentUser(),
+ true).getProxy();
this.nnRpcAddress = NetUtils.getHostPortString(nnAddress);
this.nnHttpAddress = NetUtils.getHostPortString(super.getHttpServerAddress(conf));
// get version and id info from the name-node
@@ -409,6 +390,28 @@
}
@Override
+ protected NameNodeHAContext createHAContext() {
+ return new BNHAContext();
+ }
+
+ private class BNHAContext extends NameNodeHAContext {
+ @Override // NameNode
+ public void checkOperation(OperationCategory op)
+ throws StandbyException {
+ if (op == OperationCategory.UNCHECKED ||
+ op == OperationCategory.CHECKPOINT) {
+ return;
+ }
+ if (OperationCategory.JOURNAL != op &&
+ !(OperationCategory.READ == op && allowStaleStandbyReads)) {
+ String msg = "Operation category " + op
+ + " is not supported at the BackupNode";
+ throw new StandbyException(msg);
+ }
+ }
+ }
+
+ @Override
protected String getNameServiceId(Configuration conf) {
return DFSUtil.getBackupNameServiceId(conf);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointConf.java
new file mode 100644
index 0000000..8b3cf04
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointConf.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.ImmutableList;
+
+@InterfaceAudience.Private
+public class CheckpointConf {
+ private static final Log LOG = LogFactory.getLog(CheckpointConf.class);
+
+ /** How often to checkpoint regardless of number of txns */
+ private final long checkpointPeriod; // in seconds
+
+ /** How often to poll the NN to check checkpointTxnCount */
+ private final long checkpointCheckPeriod; // in seconds
+
+ /** checkpoint once every this many transactions, regardless of time */
+ private final long checkpointTxnCount;
+
+
+ public CheckpointConf(Configuration conf) {
+ checkpointCheckPeriod = conf.getLong(
+ DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY,
+ DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT);
+
+ checkpointPeriod = conf.getLong(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
+ DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
+ checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
+ DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
+ warnForDeprecatedConfigs(conf);
+ }
+
+ private static void warnForDeprecatedConfigs(Configuration conf) {
+ for (String key : ImmutableList.of(
+ "fs.checkpoint.size",
+ "dfs.namenode.checkpoint.size")) {
+ if (conf.get(key) != null) {
+ LOG.warn("Configuration key " + key + " is deprecated! Ignoring..." +
+ " Instead please specify a value for " +
+ DFS_NAMENODE_CHECKPOINT_TXNS_KEY);
+ }
+ }
+ }
+
+ public long getPeriod() {
+ return checkpointPeriod;
+ }
+
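+ /** @return the polling interval: the check period, capped by the checkpoint period. */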
+ public long getCheckPeriod() {
+ return Math.min(checkpointCheckPeriod, checkpointPeriod);
+ }
+
+ public long getTxnCount() {
+ return checkpointTxnCount;
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java
index 39d2abae..6ae931f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java
@@ -29,7 +29,6 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
@@ -58,17 +57,16 @@
private BackupNode backupNode;
volatile boolean shouldRun;
- private long checkpointPeriod; // in seconds
- // Transactions count to trigger the checkpoint
- private long checkpointTxnCount;
private String infoBindAddress;
+ private CheckpointConf checkpointConf;
+
private BackupImage getFSImage() {
return (BackupImage)backupNode.getFSImage();
}
- private NamenodeProtocol getNamenode(){
+ private NamenodeProtocol getRemoteNamenodeProxy(){
return backupNode.namenode;
}
@@ -89,26 +87,24 @@
/**
* Initialize checkpoint.
*/
- @SuppressWarnings("deprecation")
private void initialize(Configuration conf) throws IOException {
// Create connection to the namenode.
shouldRun = true;
// Initialize other scheduling parameters from the configuration
- checkpointPeriod = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
- DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
- checkpointTxnCount = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
- DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
- SecondaryNameNode.warnForDeprecatedConfigs(conf);
+ checkpointConf = new CheckpointConf(conf);
// Pull out exact http address for posting url to avoid ip aliasing issues
String fullInfoAddr = conf.get(DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY,
DFS_NAMENODE_BACKUP_HTTP_ADDRESS_DEFAULT);
infoBindAddress = fullInfoAddr.substring(0, fullInfoAddr.indexOf(":"));
- LOG.info("Checkpoint Period : " + checkpointPeriod + " secs " +
- "(" + checkpointPeriod/60 + " min)");
- LOG.info("Transactions count is : " + checkpointTxnCount + ", to trigger checkpoint");
+ LOG.info("Checkpoint Period : " +
+ checkpointConf.getPeriod() + " secs " +
+ "(" + checkpointConf.getPeriod()/60 + " min)");
+ LOG.info("Transactions count is : " +
+ checkpointConf.getTxnCount() +
+ ", to trigger checkpoint");
}
/**
@@ -125,8 +121,8 @@
public void run() {
// Check the size of the edit log once every 5 minutes.
long periodMSec = 5 * 60; // 5 minutes
- if(checkpointPeriod < periodMSec) {
- periodMSec = checkpointPeriod;
+ if(checkpointConf.getPeriod() < periodMSec) {
+ periodMSec = checkpointConf.getPeriod();
}
periodMSec *= 1000;
@@ -142,7 +138,7 @@
shouldCheckpoint = true;
} else {
long txns = countUncheckpointedTxns();
- if(txns >= checkpointTxnCount)
+ if(txns >= checkpointConf.getTxnCount())
shouldCheckpoint = true;
}
if(shouldCheckpoint) {
@@ -165,7 +161,7 @@
}
private long countUncheckpointedTxns() throws IOException {
- long curTxId = getNamenode().getTransactionID();
+ long curTxId = getRemoteNamenodeProxy().getTransactionID();
long uncheckpointedTxns = curTxId -
getFSImage().getStorage().getMostRecentCheckpointTxId();
assert uncheckpointedTxns >= 0;
@@ -183,7 +179,7 @@
bnImage.freezeNamespaceAtNextRoll();
NamenodeCommand cmd =
- getNamenode().startCheckpoint(backupNode.getRegistration());
+ getRemoteNamenodeProxy().startCheckpoint(backupNode.getRegistration());
CheckpointCommand cpCmd = null;
switch(cmd.getAction()) {
case NamenodeProtocol.ACT_SHUTDOWN:
@@ -207,7 +203,7 @@
long lastApplied = bnImage.getLastAppliedTxId();
LOG.debug("Doing checkpoint. Last applied: " + lastApplied);
RemoteEditLogManifest manifest =
- getNamenode().getEditLogManifest(bnImage.getLastAppliedTxId() + 1);
+ getRemoteNamenodeProxy().getEditLogManifest(bnImage.getLastAppliedTxId() + 1);
if (!manifest.getLogs().isEmpty()) {
RemoteEditLog firstRemoteLog = manifest.getLogs().get(0);
@@ -243,11 +239,16 @@
long txid = bnImage.getLastAppliedTxId();
- backupNode.namesystem.dir.setReady();
- backupNode.namesystem.setBlockTotal();
-
- bnImage.saveFSImageInAllDirs(backupNode.getNamesystem(), txid);
- bnStorage.writeAll();
+ backupNode.namesystem.writeLock();
+ try {
+ backupNode.namesystem.dir.setReady();
+ backupNode.namesystem.setBlockTotal();
+
+ bnImage.saveFSImageInAllDirs(backupNode.getNamesystem(), txid);
+ bnStorage.writeAll();
+ } finally {
+ backupNode.namesystem.writeUnlock();
+ }
if(cpCmd.needToReturnImage()) {
TransferFsImage.uploadImageFromStorage(
@@ -255,7 +256,7 @@
bnStorage, txid);
}
- getNamenode().endCheckpoint(backupNode.getRegistration(), sig);
+ getRemoteNamenodeProxy().endCheckpoint(backupNode.getRegistration(), sig);
if (backupNode.getRole() == NamenodeRole.BACKUP) {
bnImage.convergeJournalSpool();
@@ -286,7 +287,7 @@
log.getStartTxId(), log.getEndTxId());
if (log.getStartTxId() > dstImage.getLastAppliedTxId()) {
editsStreams.add(new EditLogFileInputStream(f, log.getStartTxId(),
- log.getEndTxId()));
+ log.getEndTxId(), true));
}
}
LOG.info("Checkpointer about to load edits from " +
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java
index 10601b1..3ffc852 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java
@@ -39,6 +39,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo.AdminStates;
import org.apache.hadoop.util.StringUtils;
import org.codehaus.jackson.JsonNode;
@@ -66,9 +67,10 @@
ClusterStatus generateClusterHealthReport() {
ClusterStatus cs = new ClusterStatus();
Configuration conf = new Configuration();
- List<InetSocketAddress> isas = null;
+ List<ConfiguredNNAddress> nns = null;
try {
- isas = DFSUtil.getNNServiceRpcAddresses(conf);
+ nns = DFSUtil.flattenAddressMap(
+ DFSUtil.getNNServiceRpcAddresses(conf));
} catch (Exception e) {
// Could not build cluster status
cs.setError(e);
@@ -76,7 +78,8 @@
}
// Process each namenode and add it to ClusterStatus
- for (InetSocketAddress isa : isas) {
+ for (ConfiguredNNAddress cnn : nns) {
+ InetSocketAddress isa = cnn.getAddress();
NamenodeMXBeanHelper nnHelper = null;
try {
nnHelper = new NamenodeMXBeanHelper(isa, conf);
@@ -102,9 +105,10 @@
DecommissionStatus generateDecommissioningReport() {
String clusterid = "";
Configuration conf = new Configuration();
- List<InetSocketAddress> isas = null;
+ List<ConfiguredNNAddress> cnns = null;
try {
- isas = DFSUtil.getNNServiceRpcAddresses(conf);
+ cnns = DFSUtil.flattenAddressMap(
+ DFSUtil.getNNServiceRpcAddresses(conf));
} catch (Exception e) {
// catch any exception encountered other than connecting to namenodes
DecommissionStatus dInfo = new DecommissionStatus(clusterid, e);
@@ -122,7 +126,8 @@
new HashMap<String, Exception>();
List<String> unreportedNamenode = new ArrayList<String>();
- for (InetSocketAddress isa : isas) {
+ for (ConfiguredNNAddress cnn : cnns) {
+ InetSocketAddress isa = cnn.getAddress();
NamenodeMXBeanHelper nnHelper = null;
try {
nnHelper = new NamenodeMXBeanHelper(isa, conf);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/DfsServlet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/DfsServlet.java
index 6459ffd..402dcdd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/DfsServlet.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/DfsServlet.java
@@ -26,8 +26,8 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.server.common.JspHelper;
import org.apache.hadoop.ipc.RemoteException;
@@ -77,7 +77,8 @@
NameNodeHttpServer.getNameNodeAddressFromContext(context);
Configuration conf = new HdfsConfiguration(
NameNodeHttpServer.getConfFromContext(context));
- return DFSUtil.createNamenode(nnAddr, conf);
+ return NameNodeProxies.createProxy(conf, NameNode.getUri(nnAddr),
+ ClientProtocol.class).getProxy();
}
protected UserGroupInformation getUGI(HttpServletRequest request,
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java
index 637400f..a0fb8fe 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java
@@ -133,4 +133,9 @@
public long getLastTxId() throws IOException {
return HdfsConstants.INVALID_TXID;
}
+
+ @Override
+ public boolean isInProgress() {
+ return true;
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java
index 8c3ad2e..bdb4c5e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java
@@ -22,12 +22,14 @@
import java.util.Arrays;
import org.apache.hadoop.hdfs.HdfsConfiguration;
-import org.apache.hadoop.hdfs.protocolPB.JournalProtocolTranslatorPB;
+import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.server.common.Storage;
+import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.UserGroupInformation;
/**
* An implementation of the abstract class {@link EditLogOutputStream},
@@ -40,7 +42,7 @@
class EditLogBackupOutputStream extends EditLogOutputStream {
static int DEFAULT_BUFFER_SIZE = 256;
- private JournalProtocolTranslatorPB backupNode; // RPC proxy to backup node
+ private JournalProtocol backupNode; // RPC proxy to backup node
private NamenodeRegistration bnRegistration; // backup node registration
private NamenodeRegistration nnRegistration; // active node registration
private EditsDoubleBuffer doubleBuf;
@@ -55,8 +57,9 @@
InetSocketAddress bnAddress =
NetUtils.createSocketAddr(bnRegistration.getAddress());
try {
- this.backupNode =
- new JournalProtocolTranslatorPB(bnAddress, new HdfsConfiguration());
+ this.backupNode = NameNodeProxies.createNonHAProxy(new HdfsConfiguration(),
+ bnAddress, JournalProtocol.class, UserGroupInformation.getCurrentUser(),
+ true).getProxy();
} catch(IOException e) {
Storage.LOG.error("Error connecting to: " + bnAddress, e);
throw e;
@@ -93,14 +96,14 @@
throw new IOException("BackupEditStream has " + size +
" records still to be flushed and cannot be closed.");
}
- IOUtils.cleanup(Storage.LOG, backupNode); // stop the RPC threads
+ RPC.stopProxy(backupNode); // stop the RPC threads
doubleBuf.close();
doubleBuf = null;
}
@Override
public void abort() throws IOException {
- IOUtils.cleanup(Storage.LOG, backupNode);
+ RPC.stopProxy(backupNode);
doubleBuf = null;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java
index 952e4a7..22c1297 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java
@@ -41,6 +41,7 @@
private final int logVersion;
private final FSEditLogOp.Reader reader;
private final FSEditLogLoader.PositionTrackingInputStream tracker;
+ private final boolean isInProgress;
/**
* Open an EditLogInputStream for the given file.
@@ -53,7 +54,7 @@
*/
EditLogFileInputStream(File name)
throws LogHeaderCorruptException, IOException {
- this(name, HdfsConstants.INVALID_TXID, HdfsConstants.INVALID_TXID);
+ this(name, HdfsConstants.INVALID_TXID, HdfsConstants.INVALID_TXID, false);
}
/**
@@ -66,7 +67,8 @@
* @throws IOException if an actual IO error occurs while reading the
* header
*/
- EditLogFileInputStream(File name, long firstTxId, long lastTxId)
+ EditLogFileInputStream(File name, long firstTxId, long lastTxId,
+ boolean isInProgress)
throws LogHeaderCorruptException, IOException {
file = name;
fStream = new FileInputStream(name);
@@ -84,6 +86,25 @@
reader = new FSEditLogOp.Reader(in, logVersion);
this.firstTxId = firstTxId;
this.lastTxId = lastTxId;
+ this.isInProgress = isInProgress;
+ }
+
+ /**
+ * Skip over a number of transactions. Subsequent calls to
+ * {@link EditLogFileInputStream#readOp()} will begin after these skipped
+ * transactions. If more transactions are requested to be skipped than remain
+ * in the edit log, all edit log ops in the log will be skipped and subsequent
+ * calls to {@link EditLogInputStream#readOp} will return null.
+ *
+ * @param transactionsToSkip number of transactions to skip over.
+ * @throws IOException if there's an error while reading an operation
+ */
+ public void skipTransactions(long transactionsToSkip) throws IOException {
+ assert firstTxId != HdfsConstants.INVALID_TXID &&
+ lastTxId != HdfsConstants.INVALID_TXID;
+ for (long i = 0; i < transactionsToSkip; i++) {
+ reader.readOp();
+ }
}
@Override
@@ -133,6 +154,11 @@
}
@Override
+ public boolean isInProgress() {
+ return isInProgress;
+ }
+
+ @Override
public String toString() {
return getName();
}
@@ -142,11 +168,11 @@
try {
in = new EditLogFileInputStream(file);
} catch (LogHeaderCorruptException corrupt) {
- // If it's missing its header, this is equivalent to no transactions
+ // If the header is malformed or has the wrong value, this indicates corruption
FSImage.LOG.warn("Log at " + file + " has no valid header",
corrupt);
- return new FSEditLogLoader.EditLogValidation(0, HdfsConstants.INVALID_TXID,
- HdfsConstants.INVALID_TXID);
+ return new FSEditLogLoader.EditLogValidation(0,
+ HdfsConstants.INVALID_TXID, HdfsConstants.INVALID_TXID, true);
}
try {
@@ -172,14 +198,13 @@
throw new LogHeaderCorruptException(
"Reached EOF when reading log header");
}
- if (logVersion < HdfsConstants.LAYOUT_VERSION) { // future version
+ if (logVersion < HdfsConstants.LAYOUT_VERSION || // future version
+ logVersion > Storage.LAST_UPGRADABLE_LAYOUT_VERSION) { // unsupported
throw new LogHeaderCorruptException(
"Unexpected version of the file system log file: "
+ logVersion + ". Current version = "
+ HdfsConstants.LAYOUT_VERSION + ".");
}
- assert logVersion <= Storage.LAST_UPGRADABLE_LAYOUT_VERSION :
- "Unsupported version " + logVersion;
return logVersion;
}
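Note: the EditLogFileInputStream hunk above adds an isInProgress flag to the constructor and a skipTransactions() method so a reader can fast-forward past transactions it has already applied. A hedged sketch of that skip-then-resume pattern, written against a hypothetical OpReader interface (the real stream's constructor is package-private, so this is not the patch's API):

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;

public class SkipThenResumeSketch {
    interface OpReader { Long readOp() throws IOException; } // returns null at end of log

    static long resume(OpReader reader, long alreadyApplied) throws IOException {
        for (long i = 0; i < alreadyApplied; i++) {
            reader.readOp();                      // discard previously-applied ops
        }
        long applied = 0;
        while (reader.readOp() != null) {         // apply the remaining ops
            applied++;
        }
        return applied;
    }

    public static void main(String[] args) throws IOException {
        Iterator<Long> it = Arrays.asList(1L, 2L, 3L, 4L, 5L).iterator();
        OpReader reader = () -> it.hasNext() ? it.next() : null;
        System.out.println(resume(reader, 2));    // prints 3
    }
}

Skipping simply consumes and discards ops, so skipping past the end of the log leaves subsequent reads returning null, matching the javadoc added above.
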
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java
index bdc0bd2..f7e1f01 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hdfs.server.namenode;
+import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
@@ -27,6 +28,7 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.io.IOUtils;
@@ -36,7 +38,8 @@
* An implementation of the abstract class {@link EditLogOutputStream}, which
* stores edits in a local file.
*/
-class EditLogFileOutputStream extends EditLogOutputStream {
+@InterfaceAudience.Private
+public class EditLogFileOutputStream extends EditLogOutputStream {
private static Log LOG = LogFactory.getLog(EditLogFileOutputStream.class);
private File file;
@@ -96,11 +99,23 @@
public void create() throws IOException {
fc.truncate(0);
fc.position(0);
- doubleBuf.getCurrentBuf().writeInt(HdfsConstants.LAYOUT_VERSION);
+ writeHeader(doubleBuf.getCurrentBuf());
setReadyToFlush();
flush();
}
+ /**
+ * Write header information for this EditLogFileOutputStream to the provided
+ * DataOutputStream.
+ *
+ * @param out the output stream to write the header to.
+ * @throws IOException in the event of error writing to the stream.
+ */
+ @VisibleForTesting
+ public static void writeHeader(DataOutputStream out) throws IOException {
+ out.writeInt(HdfsConstants.LAYOUT_VERSION);
+ }
+
@Override
public void close() throws IOException {
if (fp == null) {
@@ -204,6 +219,11 @@
File getFile() {
return file;
}
+
+ @Override
+ public String toString() {
+ return "EditLogFileOutputStream(" + file + ")";
+ }
/**
* @return true if this stream is currently open.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputException.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputException.java
new file mode 100644
index 0000000..56edf8c
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputException.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+/**
+ * Thrown when there's a failure to read an edit log op from disk when loading
+ * edits.
+ */
+@InterfaceAudience.Private
+public class EditLogInputException extends IOException {
+
+ private static final long serialVersionUID = 1L;
+
+ private final long numEditsLoaded;
+
+ public EditLogInputException(String message, Throwable cause,
+ long numEditsLoaded) {
+ super(message, cause);
+ this.numEditsLoaded = numEditsLoaded;
+ }
+
+ public long getNumEditsLoaded() {
+ return numEditsLoaded;
+ }
+
+}
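Note: EditLogInputException exists so that a failure partway through reading an edit log still reports how many edits were successfully applied before the bad op. A self-contained sketch of the same pattern, with hypothetical names (PartialLoadException, load) rather than the patch's classes:

import java.io.IOException;

public class ProgressAwareFailureSketch {
    // Hypothetical analogue of EditLogInputException: an IOException that also
    // reports how many records were applied before the failure.
    static class PartialLoadException extends IOException {
        private final long loaded;
        PartialLoadException(String msg, Throwable cause, long loaded) {
            super(msg, cause);
            this.loaded = loaded;
        }
        long getLoaded() { return loaded; }
    }

    static long load(long[] records) throws PartialLoadException {
        long applied = 0;
        for (long r : records) {
            if (r < 0) {                           // simulate a corrupt record
                throw new PartialLoadException("bad record", null, applied);
            }
            applied++;
        }
        return applied;
    }

    public static void main(String[] args) {
        try {
            load(new long[] {1, 2, -1, 4});
        } catch (PartialLoadException e) {
            // The caller still knows how many records were applied before the failure.
            System.out.println("applied before failure: " + e.getLoaded());
        }
    }
}
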
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputStream.java
index 3ad19951..7a7f8d8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogInputStream.java
@@ -22,6 +22,9 @@
import java.io.Closeable;
import java.io.IOException;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
/**
* A generic abstract class to support reading edits log data from
* persistent storage.
@@ -79,4 +82,9 @@
* Return the size of the current edits log.
*/
public abstract long length() throws IOException;
+
+ /**
+ * Return true if this stream is in progress, false if it is finalized.
+ */
+ public abstract boolean isInProgress();
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
index 45ce9df..ab0f4c4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
@@ -261,113 +261,32 @@
*/
INode unprotectedAddFile( String path,
PermissionStatus permissions,
- BlockInfo[] blocks,
short replication,
long modificationTime,
long atime,
long preferredBlockSize,
+ boolean underConstruction,
String clientName,
String clientMachine)
throws UnresolvedLinkException {
INode newNode;
assert hasWriteLock();
- if (blocks == null)
- newNode = new INodeDirectory(permissions, modificationTime);
- else if(blocks.length == 0 || blocks[blocks.length-1].getBlockUCState()
- == BlockUCState.UNDER_CONSTRUCTION) {
+ if (underConstruction) {
newNode = new INodeFileUnderConstruction(
- permissions, blocks.length, replication,
+ permissions, replication,
preferredBlockSize, modificationTime, clientName,
clientMachine, null);
} else {
- newNode = new INodeFile(permissions, blocks.length, replication,
+ newNode = new INodeFile(permissions, 0, replication,
modificationTime, atime, preferredBlockSize);
}
- writeLock();
+
try {
- try {
- newNode = addNode(path, newNode, UNKNOWN_DISK_SPACE);
- if(newNode != null && blocks != null) {
- int nrBlocks = blocks.length;
- // Add file->block mapping
- INodeFile newF = (INodeFile)newNode;
- for (int i = 0; i < nrBlocks; i++) {
- newF.setBlock(i, getBlockManager().addINode(blocks[i], newF));
- }
- }
- } catch (IOException e) {
- return null;
- }
- return newNode;
- } finally {
- writeUnlock();
+ newNode = addNode(path, newNode, UNKNOWN_DISK_SPACE);
+ } catch (IOException e) {
+ return null;
}
-
- }
-
- /**
- * Update files in-memory data structures with new block information.
- * @throws IOException
- */
- void updateFile(INodeFile file,
- String path,
- BlockInfo[] blocks,
- long mtime,
- long atime) throws IOException {
-
- // Update the salient file attributes.
- file.setAccessTime(atime);
- file.setModificationTimeForce(mtime);
-
- // Update its block list
- BlockInfo[] oldBlocks = file.getBlocks();
-
- // Are we only updating the last block's gen stamp.
- boolean isGenStampUpdate = oldBlocks.length == blocks.length;
-
- // First, update blocks in common
- BlockInfo oldBlock = null;
- for (int i = 0; i < oldBlocks.length && i < blocks.length; i++) {
- oldBlock = oldBlocks[i];
- Block newBlock = blocks[i];
-
- boolean isLastBlock = i == oldBlocks.length - 1;
- if (oldBlock.getBlockId() != newBlock.getBlockId() ||
- (oldBlock.getGenerationStamp() != newBlock.getGenerationStamp() &&
- !(isGenStampUpdate && isLastBlock))) {
- throw new IOException("Mismatched block IDs or generation stamps, " +
- "attempting to replace block " + oldBlock + " with " + newBlock +
- " as block # " + i + "/" + blocks.length + " of " + path);
- }
-
- oldBlock.setNumBytes(newBlock.getNumBytes());
- oldBlock.setGenerationStamp(newBlock.getGenerationStamp());
- }
-
- if (blocks.length < oldBlocks.length) {
- // We're removing a block from the file, e.g. abandonBlock(...)
- if (!file.isUnderConstruction()) {
- throw new IOException("Trying to remove a block from file " +
- path + " which is not under construction.");
- }
- if (blocks.length != oldBlocks.length - 1) {
- throw new IOException("Trying to remove more than one block from file "
- + path);
- }
- unprotectedRemoveBlock(path,
- (INodeFileUnderConstruction)file, oldBlocks[oldBlocks.length - 1]);
- } else if (blocks.length > oldBlocks.length) {
- // We're adding blocks
- // First complete last old Block
- getBlockManager().completeBlock(file, oldBlocks.length-1, true);
- // Add the new blocks
- for (int i = oldBlocks.length; i < blocks.length; i++) {
- // addBlock();
- BlockInfo newBI = blocks[i];
- getBlockManager().addINode(newBI, file);
- file.addBlock(newBI);
- }
- }
+ return newNode;
}
INodeDirectory addToParent(byte[] src, INodeDirectory parentINode,
@@ -450,7 +369,7 @@
writeLock();
try {
- fsImage.getEditLog().logOpenFile(path, file);
+ fsImage.getEditLog().logUpdateBlocks(path, file);
if(NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* FSDirectory.persistBlocks: "
+path+" with "+ file.getBlocks().length
@@ -460,7 +379,7 @@
writeUnlock();
}
}
-
+
/**
* Close file.
*/
@@ -483,7 +402,7 @@
}
/**
- * Remove a block to the file.
+ * Remove a block from the file.
*/
boolean removeBlock(String path, INodeFileUnderConstruction fileNode,
Block block) throws IOException {
@@ -499,7 +418,7 @@
}
return true;
}
-
+
void unprotectedRemoveBlock(String path, INodeFileUnderConstruction fileNode,
Block block) throws IOException {
// modify file-> block and blocksMap
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
index 80c6088..7c630d7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
@@ -62,22 +62,36 @@
/**
* State machine for edit log.
+ *
+ * In a non-HA setup:
+ *
* The log starts in UNITIALIZED state upon construction. Once it's
- * initialized, it is usually in IN_SEGMENT state, indicating that edits
- * may be written. In the middle of a roll, or while saving the namespace,
- * it briefly enters the BETWEEN_LOG_SEGMENTS state, indicating that the
- * previous segment has been closed, but the new one has not yet been opened.
+ * initialized, it is usually in IN_SEGMENT state, indicating that edits may
+ * be written. In the middle of a roll, or while saving the namespace, it
+ * briefly enters the BETWEEN_LOG_SEGMENTS state, indicating that the previous
+ * segment has been closed, but the new one has not yet been opened.
+ *
+ * In an HA setup:
+ *
+ * The log starts in UNINITIALIZED state upon construction. Once it's
+ * initialized, it sits in the OPEN_FOR_READING state the entire time that the
+ * NN is in standby. Upon the NN transition to active, the log will be CLOSED,
+ * and then move to being BETWEEN_LOG_SEGMENTS, much as if the NN had just
+ * started up, and then will move to IN_SEGMENT so it can begin writing to the
+ * log. The log states will then revert to behaving as they do in a non-HA
+ * setup.
*/
private enum State {
UNINITIALIZED,
BETWEEN_LOG_SEGMENTS,
IN_SEGMENT,
+ OPEN_FOR_READING,
CLOSED;
}
private State state = State.UNINITIALIZED;
//initialize
- private JournalSet journalSet;
+ private JournalSet journalSet = null;
private EditLogOutputStream editLogStream = null;
// a monotonically increasing counter that represents transactionIds.
@@ -112,7 +126,12 @@
private NNStorage storage;
private Configuration conf;
- private Collection<URI> editsDirs;
+ private List<URI> editsDirs;
+
+ /**
+ * The edit directories that are shared between primary and secondary.
+ */
+ private List<URI> sharedEditsDirs;
private static class TransactionId {
public long txid;
@@ -151,11 +170,11 @@
* @param storage Storage object used by namenode
* @param editsDirs List of journals to use
*/
- FSEditLog(Configuration conf, NNStorage storage, Collection<URI> editsDirs) {
+ FSEditLog(Configuration conf, NNStorage storage, List<URI> editsDirs) {
init(conf, storage, editsDirs);
}
- private void init(Configuration conf, NNStorage storage, Collection<URI> editsDirs) {
+ private void init(Configuration conf, NNStorage storage, List<URI> editsDirs) {
isSyncRunning = false;
this.conf = conf;
this.storage = storage;
@@ -165,19 +184,44 @@
// If this list is empty, an error will be thrown on first use
// of the editlog, as no journals will exist
this.editsDirs = Lists.newArrayList(editsDirs);
+
+ this.sharedEditsDirs = FSNamesystem.getSharedEditsDirs(conf);
+ }
+
+ public synchronized void initJournalsForWrite() {
+ Preconditions.checkState(state == State.UNINITIALIZED ||
+ state == State.CLOSED, "Unexpected state: %s", state);
+ initJournals(this.editsDirs);
+ state = State.BETWEEN_LOG_SEGMENTS;
+ }
+
+ public synchronized void initSharedJournalsForRead() {
+ if (state == State.OPEN_FOR_READING) {
+ LOG.warn("Initializing shared journals for READ, already open for READ",
+ new Exception());
+ return;
+ }
+ Preconditions.checkState(state == State.UNINITIALIZED ||
+ state == State.CLOSED);
+
+ initJournals(this.sharedEditsDirs);
+ state = State.OPEN_FOR_READING;
+ }
+
+ private synchronized void initJournals(List<URI> dirs) {
int minimumRedundantJournals = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY,
DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_DEFAULT);
journalSet = new JournalSet(minimumRedundantJournals);
- for (URI u : this.editsDirs) {
+ for (URI u : dirs) {
boolean required = FSNamesystem.getRequiredNamespaceEditsDirs(conf)
.contains(u);
if (u.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) {
StorageDirectory sd = storage.getStorageDirectory(u);
if (sd != null) {
- journalSet.add(new FileJournalManager(sd), required);
+ journalSet.add(new FileJournalManager(sd, storage), required);
}
} else {
journalSet.add(createJournal(u), required);
@@ -187,7 +231,6 @@
if (journalSet.isEmpty()) {
LOG.error("No edits directories configured!");
}
- state = State.BETWEEN_LOG_SEGMENTS;
}
/**
@@ -202,18 +245,51 @@
* Initialize the output stream for logging, opening the first
* log segment.
*/
- synchronized void open() throws IOException {
- Preconditions.checkState(state == State.BETWEEN_LOG_SEGMENTS);
+ synchronized void openForWrite() throws IOException {
+ Preconditions.checkState(state == State.BETWEEN_LOG_SEGMENTS,
+ "Bad state: %s", state);
- startLogSegment(getLastWrittenTxId() + 1, true);
+ long segmentTxId = getLastWrittenTxId() + 1;
+ // Safety check: we should never start a segment if there are
+ // newer txids readable.
+ EditLogInputStream s = journalSet.getInputStream(segmentTxId, true);
+ try {
+ Preconditions.checkState(s == null,
+ "Cannot start writing at txid %s when there is a stream " +
+ "available for read: %s", segmentTxId, s);
+ } finally {
+ IOUtils.closeStream(s);
+ }
+
+ startLogSegment(segmentTxId, true);
assert state == State.IN_SEGMENT : "Bad state: " + state;
}
- synchronized boolean isOpen() {
+ /**
+ * @return true if the log is currently open in write mode, regardless
+ * of whether it actually has an open segment.
+ */
+ synchronized boolean isOpenForWrite() {
+ return state == State.IN_SEGMENT ||
+ state == State.BETWEEN_LOG_SEGMENTS;
+ }
+
+ /**
+ * @return true if the log is open in write mode and has a segment open
+ * ready to take edits.
+ */
+ synchronized boolean isSegmentOpen() {
return state == State.IN_SEGMENT;
}
/**
+ * @return true if the log is open in read mode.
+ */
+ public synchronized boolean isOpenForRead() {
+ return state == State.OPEN_FOR_READING;
+ }
+
+ /**
* Shutdown the file store.
*/
synchronized void close() {
@@ -242,7 +318,8 @@
*/
void logEdit(final FSEditLogOp op) {
synchronized (this) {
- assert state != State.CLOSED;
+ assert isOpenForWrite() :
+ "bad state: " + state;
// wait if an automatic sync is scheduled
waitIfAutoSyncScheduled();
@@ -329,7 +406,7 @@
/**
* Return the transaction ID of the last transaction written to the log.
*/
- synchronized long getLastWrittenTxId() {
+ public synchronized long getLastWrittenTxId() {
return txid;
}
@@ -337,7 +414,7 @@
* @return the first transaction ID in the current log segment
*/
synchronized long getCurSegmentTxId() {
- Preconditions.checkState(state == State.IN_SEGMENT,
+ Preconditions.checkState(isSegmentOpen(),
"Bad state: %s", state);
return curSegmentTxId;
}
@@ -549,6 +626,13 @@
logEdit(op);
}
+ public void logUpdateBlocks(String path, INodeFileUnderConstruction file) {
+ UpdateBlocksOp op = UpdateBlocksOp.getInstance()
+ .setPath(path)
+ .setBlocks(file.getBlocks());
+ logEdit(op);
+ }
+
/**
* Add create directory record to edit log
*/
@@ -724,16 +808,25 @@
* Used only by unit tests.
*/
@VisibleForTesting
- List<JournalAndStream> getJournals() {
+ synchronized List<JournalAndStream> getJournals() {
return journalSet.getAllJournalStreams();
}
/**
+ * Used only by tests.
+ */
+ @VisibleForTesting
+ synchronized public JournalSet getJournalSet() {
+ return journalSet;
+ }
+
+ /**
* Used only by unit tests.
*/
@VisibleForTesting
synchronized void setRuntimeForTesting(Runtime runtime) {
this.runtime = runtime;
+ this.journalSet.setRuntimeForTesting(runtime);
}
/**
@@ -796,7 +889,7 @@
editLogStream = journalSet.startLogSegment(segmentTxId);
} catch (IOException ex) {
throw new IOException("Unable to start log segment " +
- segmentTxId + ": no journals successfully started.");
+ segmentTxId + ": too few journals successfully started.", ex);
}
curSegmentTxId = segmentTxId;
@@ -815,7 +908,7 @@
*/
synchronized void endCurrentLogSegment(boolean writeEndTxn) {
LOG.info("Ending log segment " + curSegmentTxId);
- Preconditions.checkState(state == State.IN_SEGMENT,
+ Preconditions.checkState(isSegmentOpen(),
"Bad state: %s", state);
if (writeEndTxn) {
@@ -847,6 +940,7 @@
if (editLogStream != null) {
editLogStream.abort();
editLogStream = null;
+ state = State.BETWEEN_LOG_SEGMENTS;
}
} catch (IOException e) {
LOG.warn("All journals failed to abort", e);
@@ -856,17 +950,14 @@
/**
* Archive any log files that are older than the given txid.
*/
- public void purgeLogsOlderThan(final long minTxIdToKeep) {
- synchronized (this) {
- // synchronized to prevent findbugs warning about inconsistent
- // synchronization. This will be JIT-ed out if asserts are
- // off.
- assert curSegmentTxId == HdfsConstants.INVALID_TXID || // on format this is no-op
- minTxIdToKeep <= curSegmentTxId :
- "cannot purge logs older than txid " + minTxIdToKeep +
- " when current segment starts at " + curSegmentTxId;
- }
+ public synchronized void purgeLogsOlderThan(final long minTxIdToKeep) {
+ assert curSegmentTxId == HdfsConstants.INVALID_TXID || // on format this is no-op
+ minTxIdToKeep <= curSegmentTxId :
+ "cannot purge logs older than txid " + minTxIdToKeep +
+ " when current segment starts at " + curSegmentTxId;
+ // This could be improved to not need synchronization. But currently,
+ // journalSet is not threadsafe, so we need to synchronize this method.
try {
journalSet.purgeLogsOlderThan(minTxIdToKeep);
} catch (IOException ex) {
@@ -898,8 +989,8 @@
// sets the initial capacity of the flush buffer.
- public void setOutputBufferCapacity(int size) {
- journalSet.setOutputBufferCapacity(size);
+ synchronized void setOutputBufferCapacity(int size) {
+ journalSet.setOutputBufferCapacity(size);
}
/**
@@ -975,32 +1066,45 @@
/**
* Run recovery on all journals to recover any unclosed segments
*/
- void recoverUnclosedStreams() {
+ synchronized void recoverUnclosedStreams() {
+ Preconditions.checkState(
+ state == State.BETWEEN_LOG_SEGMENTS,
+ "May not recover segments - wrong state: %s", state);
try {
journalSet.recoverUnfinalizedSegments();
} catch (IOException ex) {
// All journals have failed, it is handled in logSync.
}
}
+
+ Collection<EditLogInputStream> selectInputStreams(long fromTxId,
+ long toAtLeastTxId) throws IOException {
+ return selectInputStreams(fromTxId, toAtLeastTxId, true);
+ }
/**
* Select a list of input streams to load.
+ *
* @param fromTxId first transaction in the selected streams
* @param toAtLeast the selected streams must contain this transaction
+ * @param inProgressOk set to true if in-progress streams are OK
*/
- Collection<EditLogInputStream> selectInputStreams(long fromTxId,
- long toAtLeastTxId) throws IOException {
+ public synchronized Collection<EditLogInputStream> selectInputStreams(long fromTxId,
+ long toAtLeastTxId, boolean inProgressOk) throws IOException {
List<EditLogInputStream> streams = new ArrayList<EditLogInputStream>();
- EditLogInputStream stream = journalSet.getInputStream(fromTxId);
+ EditLogInputStream stream = journalSet.getInputStream(fromTxId, inProgressOk);
while (stream != null) {
- fromTxId = stream.getLastTxId() + 1;
streams.add(stream);
- stream = journalSet.getInputStream(fromTxId);
+ // We're now looking for a higher range, so reset the fromTxId
+ fromTxId = stream.getLastTxId() + 1;
+ stream = journalSet.getInputStream(fromTxId, inProgressOk);
}
+
if (fromTxId <= toAtLeastTxId) {
closeAllStreams(streams);
- throw new IOException("No non-corrupt logs for txid "
- + fromTxId);
+ throw new IOException(String.format("Gap in transactions. Expected to "
+ + "be able to read up until at least txid %d but unable to find any "
+ + "edit logs containing txid %d", toAtLeastTxId, fromTxId));
}
return streams;
}
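Note: the reworked selectInputStreams above chains streams by asking the journal set for the range starting just after the previous stream's last txid, and raises a descriptive error when a gap remains below toAtLeastTxId. A rough, self-contained sketch of that gap-detection loop over a hypothetical start-txid -> last-txid index (names and data structures here are illustrative, not the patch's JournalSet API):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NavigableMap;
import java.util.TreeMap;

public class StreamSelectionSketch {
    // Hypothetical journal index: segment start txid -> segment last txid.
    static List<long[]> select(NavigableMap<Long, Long> segments,
                               long fromTxId, long toAtLeastTxId) throws IOException {
        List<long[]> chosen = new ArrayList<long[]>();
        Long last = segments.get(fromTxId);
        while (last != null) {
            chosen.add(new long[] {fromTxId, last});
            fromTxId = last + 1;                  // look for the next contiguous range
            last = segments.get(fromTxId);
        }
        if (fromTxId <= toAtLeastTxId) {
            throw new IOException(String.format(
                "Gap in transactions: needed txid %d but no segment starts at %d",
                toAtLeastTxId, fromTxId));
        }
        return chosen;
    }

    public static void main(String[] args) throws IOException {
        NavigableMap<Long, Long> segments = new TreeMap<Long, Long>();
        segments.put(1L, 10L);
        segments.put(11L, 25L);
        System.out.println(select(segments, 1L, 20L).size()); // prints 2
    }
}
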
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java
index b93942d..7c24107 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java
@@ -28,6 +28,7 @@
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
@@ -36,6 +37,7 @@
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.BlockListUpdatingOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CancelDelegationTokenOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ConcatDeleteOp;
@@ -54,9 +56,12 @@
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SymlinkOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.TimesOp;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateBlocksOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.UpdateMasterKeyOp;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
import org.apache.hadoop.hdfs.util.Holder;
+import org.apache.hadoop.io.IOUtils;
+
import com.google.common.base.Joiner;
@InterfaceAudience.Private
@@ -73,40 +78,32 @@
* This is where we apply edits that we've been writing to disk all
* along.
*/
- int loadFSEdits(EditLogInputStream edits, long expectedStartingTxId)
- throws IOException {
- long startTime = now();
- int numEdits = loadFSEdits(edits, true, expectedStartingTxId);
- FSImage.LOG.info("Edits file " + edits.getName()
- + " of size " + edits.length() + " edits # " + numEdits
- + " loaded in " + (now()-startTime)/1000 + " seconds.");
- return numEdits;
- }
-
- int loadFSEdits(EditLogInputStream edits, boolean closeOnExit,
- long expectedStartingTxId)
+ long loadFSEdits(EditLogInputStream edits, long expectedStartingTxId)
throws IOException {
- int numEdits = 0;
+ long numEdits = 0;
int logVersion = edits.getVersion();
+ fsNamesys.writeLock();
try {
+ long startTime = now();
numEdits = loadEditRecords(logVersion, edits, false,
expectedStartingTxId);
+ FSImage.LOG.info("Edits file " + edits.getName()
+ + " of size " + edits.length() + " edits # " + numEdits
+ + " loaded in " + (now()-startTime)/1000 + " seconds.");
} finally {
- if(closeOnExit) {
- edits.close();
- }
+ edits.close();
+ fsNamesys.writeUnlock();
}
return numEdits;
}
- @SuppressWarnings("deprecation")
- int loadEditRecords(int logVersion, EditLogInputStream in, boolean closeOnExit,
+ long loadEditRecords(int logVersion, EditLogInputStream in, boolean closeOnExit,
long expectedStartingTxId)
- throws IOException {
+ throws IOException, EditLogInputException {
FSDirectory fsDir = fsNamesys.dir;
- int numEdits = 0;
+ long numEdits = 0;
EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts =
new EnumMap<FSEditLogOpCodes, Holder<Integer>>(FSEditLogOpCodes.class);
@@ -120,9 +117,20 @@
long txId = expectedStartingTxId - 1;
try {
try {
- FSEditLogOp op;
- while ((op = in.readOp()) != null) {
- recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] =
+ while (true) {
+ FSEditLogOp op;
+ try {
+ if ((op = in.readOp()) == null) {
+ break;
+ }
+ } catch (IOException ioe) {
+ long badTxId = txId + 1; // because txId hasn't been incremented yet
+ String errorMessage = formatEditLogReplayError(in, recentOpcodeOffsets, badTxId);
+ FSImage.LOG.error(errorMessage);
+ throw new EditLogInputException(errorMessage,
+ ioe, numEdits);
+ }
+ recentOpcodeOffsets[(int)(numEdits % recentOpcodeOffsets.length)] =
in.getPosition();
if (LayoutVersion.supports(Feature.STORED_TXIDS, logVersion)) {
long expectedTxId = txId + 1;
@@ -133,310 +141,442 @@
}
}
- numEdits++;
incrOpCount(op.opCode, opCounts);
- switch (op.opCode) {
- case OP_ADD:
- case OP_CLOSE: {
- AddCloseOp addCloseOp = (AddCloseOp)op;
-
- // versions > 0 support per file replication
- // get name and replication
- final short replication = fsNamesys.getBlockManager(
- ).adjustReplication(addCloseOp.replication);
-
- long blockSize = addCloseOp.blockSize;
- BlockInfo blocks[] = new BlockInfo[addCloseOp.blocks.length];
- for (int i = 0; i < addCloseOp.blocks.length; i++) {
- if(addCloseOp.opCode == FSEditLogOpCodes.OP_ADD
- && i == addCloseOp.blocks.length-1) {
- blocks[i] = new BlockInfoUnderConstruction(addCloseOp.blocks[i],
- replication);
- } else {
- blocks[i] = new BlockInfo(addCloseOp.blocks[i], replication);
- }
- }
-
- PermissionStatus permissions = fsNamesys.getUpgradePermission();
- if (addCloseOp.permissions != null) {
- permissions = addCloseOp.permissions;
- }
-
-
- // Older versions of HDFS does not store the block size in inode.
- // If the file has more than one block, use the size of the
- // first block as the blocksize. Otherwise use the default
- // block size.
- if (-8 <= logVersion && blockSize == 0) {
- if (blocks.length > 1) {
- blockSize = blocks[0].getNumBytes();
- } else {
- long first = ((blocks.length == 1)? blocks[0].getNumBytes(): 0);
- blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first);
- }
- }
-
-
- // The open lease transaction re-creates a file if necessary.
- // Delete the file if it already exists.
- if (FSNamesystem.LOG.isDebugEnabled()) {
- FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
- " numblocks : " + blocks.length +
- " clientHolder " + addCloseOp.clientName +
- " clientMachine " + addCloseOp.clientMachine);
- }
-
- // There are four cases here:
- // 1. OP_ADD to create a new file
- // 2. OP_ADD to update file blocks
- // 3. OP_ADD to open file for append
- // 4. OP_CLOSE to close the file
-
- // See if the file already exists
- INodeFile oldFile = fsDir.getFileINode(addCloseOp.path);
- if (oldFile == null) { // OP_ADD for a new file
- assert addCloseOp.opCode == FSEditLogOpCodes.OP_ADD :
- "Expected opcode OP_ADD, but got " + addCloseOp.opCode;
- fsDir.unprotectedAddFile(
- addCloseOp.path, permissions, blocks, replication,
- addCloseOp.mtime, addCloseOp.atime, blockSize,
- addCloseOp.clientName, addCloseOp.clientMachine);
- } else {
- fsDir.updateFile(oldFile, addCloseOp.path, blocks,
- addCloseOp.mtime, addCloseOp.atime);
- if(addCloseOp.opCode == FSEditLogOpCodes.OP_CLOSE) { // OP_CLOSE
- if (!oldFile.isUnderConstruction() &&
- logVersion <= LayoutVersion.BUGFIX_HDFS_2991_VERSION) {
- // There was a bug (HDFS-2991) in hadoop < 0.23.1 where OP_CLOSE
- // could show up twice in a row. But after that version, this
- // should be fixed, so we should treat it as an error.
- throw new IOException(
- "File is not under construction: " + addCloseOp.path);
- }
- fsNamesys.getBlockManager().completeBlock(
- oldFile, blocks.length-1, true);
-
- if (oldFile.isUnderConstruction()) {
- INodeFile newFile =
- ((INodeFileUnderConstruction)oldFile).convertToInodeFile();
- fsDir.replaceNode(addCloseOp.path, oldFile, newFile);
- }
- } else if(! oldFile.isUnderConstruction()) { // OP_ADD for append
- INodeFileUnderConstruction cons = new INodeFileUnderConstruction(
- oldFile.getLocalNameBytes(),
- oldFile.getReplication(),
- oldFile.getModificationTime(),
- oldFile.getPreferredBlockSize(),
- oldFile.getBlocks(),
- oldFile.getPermissionStatus(),
- addCloseOp.clientName,
- addCloseOp.clientMachine,
- null);
- fsDir.replaceNode(addCloseOp.path, oldFile, cons);
- }
- }
- // Update file lease
- if(addCloseOp.opCode == FSEditLogOpCodes.OP_ADD) {
- fsNamesys.leaseManager.addLease(addCloseOp.clientName, addCloseOp.path);
- } else { // Ops.OP_CLOSE
- if (oldFile.isUnderConstruction()) {
- fsNamesys.leaseManager.removeLease(
- ((INodeFileUnderConstruction)oldFile).getClientName(), addCloseOp.path);
- }
- }
- break;
+ try {
+ applyEditLogOp(op, fsDir, logVersion);
+ } catch (Throwable t) {
+ // Catch Throwable because in the case of a truly corrupt edits log, any
+ // sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.)
+ String errorMessage = formatEditLogReplayError(in, recentOpcodeOffsets, txId);
+ FSImage.LOG.error(errorMessage);
+ throw new IOException(errorMessage, t);
}
- case OP_SET_REPLICATION: {
- SetReplicationOp setReplicationOp = (SetReplicationOp)op;
- short replication = fsNamesys.getBlockManager().adjustReplication(
- setReplicationOp.replication);
- fsDir.unprotectedSetReplication(setReplicationOp.path,
- replication, null);
- break;
- }
- case OP_CONCAT_DELETE: {
- ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op;
- fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs,
- concatDeleteOp.timestamp);
- break;
- }
- case OP_RENAME_OLD: {
- RenameOldOp renameOp = (RenameOldOp)op;
- HdfsFileStatus dinfo = fsDir.getFileInfo(renameOp.dst, false);
- fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
- renameOp.timestamp);
- fsNamesys.unprotectedChangeLease(renameOp.src, renameOp.dst, dinfo);
- break;
- }
- case OP_DELETE: {
- DeleteOp deleteOp = (DeleteOp)op;
- fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp);
- break;
- }
- case OP_MKDIR: {
- MkdirOp mkdirOp = (MkdirOp)op;
- PermissionStatus permissions = fsNamesys.getUpgradePermission();
- if (mkdirOp.permissions != null) {
- permissions = mkdirOp.permissions;
- }
-
- fsDir.unprotectedMkdir(mkdirOp.path, permissions,
- mkdirOp.timestamp);
- break;
- }
- case OP_SET_GENSTAMP: {
- SetGenstampOp setGenstampOp = (SetGenstampOp)op;
- fsNamesys.setGenerationStamp(setGenstampOp.genStamp);
- break;
- }
- case OP_SET_PERMISSIONS: {
- SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op;
- fsDir.unprotectedSetPermission(setPermissionsOp.src,
- setPermissionsOp.permissions);
- break;
- }
- case OP_SET_OWNER: {
- SetOwnerOp setOwnerOp = (SetOwnerOp)op;
- fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username,
- setOwnerOp.groupname);
- break;
- }
- case OP_SET_NS_QUOTA: {
- SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op;
- fsDir.unprotectedSetQuota(setNSQuotaOp.src,
- setNSQuotaOp.nsQuota,
- HdfsConstants.QUOTA_DONT_SET);
- break;
- }
- case OP_CLEAR_NS_QUOTA: {
- ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op;
- fsDir.unprotectedSetQuota(clearNSQuotaOp.src,
- HdfsConstants.QUOTA_RESET,
- HdfsConstants.QUOTA_DONT_SET);
- break;
- }
-
- case OP_SET_QUOTA:
- SetQuotaOp setQuotaOp = (SetQuotaOp)op;
- fsDir.unprotectedSetQuota(setQuotaOp.src,
- setQuotaOp.nsQuota,
- setQuotaOp.dsQuota);
- break;
-
- case OP_TIMES: {
- TimesOp timesOp = (TimesOp)op;
-
- fsDir.unprotectedSetTimes(timesOp.path,
- timesOp.mtime,
- timesOp.atime, true);
- break;
- }
- case OP_SYMLINK: {
- SymlinkOp symlinkOp = (SymlinkOp)op;
- fsDir.unprotectedSymlink(symlinkOp.path, symlinkOp.value,
- symlinkOp.mtime, symlinkOp.atime,
- symlinkOp.permissionStatus);
- break;
- }
- case OP_RENAME: {
- RenameOp renameOp = (RenameOp)op;
-
- HdfsFileStatus dinfo = fsDir.getFileInfo(renameOp.dst, false);
- fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
- renameOp.timestamp, renameOp.options);
- fsNamesys.unprotectedChangeLease(renameOp.src, renameOp.dst, dinfo);
- break;
- }
- case OP_GET_DELEGATION_TOKEN: {
- GetDelegationTokenOp getDelegationTokenOp
- = (GetDelegationTokenOp)op;
-
- fsNamesys.getDelegationTokenSecretManager()
- .addPersistedDelegationToken(getDelegationTokenOp.token,
- getDelegationTokenOp.expiryTime);
- break;
- }
- case OP_RENEW_DELEGATION_TOKEN: {
- RenewDelegationTokenOp renewDelegationTokenOp
- = (RenewDelegationTokenOp)op;
- fsNamesys.getDelegationTokenSecretManager()
- .updatePersistedTokenRenewal(renewDelegationTokenOp.token,
- renewDelegationTokenOp.expiryTime);
- break;
- }
- case OP_CANCEL_DELEGATION_TOKEN: {
- CancelDelegationTokenOp cancelDelegationTokenOp
- = (CancelDelegationTokenOp)op;
- fsNamesys.getDelegationTokenSecretManager()
- .updatePersistedTokenCancellation(
- cancelDelegationTokenOp.token);
- break;
- }
- case OP_UPDATE_MASTER_KEY: {
- UpdateMasterKeyOp updateMasterKeyOp = (UpdateMasterKeyOp)op;
- fsNamesys.getDelegationTokenSecretManager()
- .updatePersistedMasterKey(updateMasterKeyOp.key);
- break;
- }
- case OP_REASSIGN_LEASE: {
- ReassignLeaseOp reassignLeaseOp = (ReassignLeaseOp)op;
-
- Lease lease = fsNamesys.leaseManager.getLease(
- reassignLeaseOp.leaseHolder);
- INodeFileUnderConstruction pendingFile =
- (INodeFileUnderConstruction) fsDir.getFileINode(
- reassignLeaseOp.path);
- fsNamesys.reassignLeaseInternal(lease,
- reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile);
- break;
- }
- case OP_START_LOG_SEGMENT:
- case OP_END_LOG_SEGMENT: {
- // no data in here currently.
- break;
- }
- case OP_DATANODE_ADD:
- case OP_DATANODE_REMOVE:
- break;
- default:
- throw new IOException("Invalid operation read " + op.opCode);
- }
+ numEdits++;
}
-
} catch (IOException ex) {
check203UpgradeFailure(logVersion, ex);
} finally {
if(closeOnExit)
in.close();
}
- } catch (Throwable t) {
- // Catch Throwable because in the case of a truly corrupt edits log, any
- // sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.)
- StringBuilder sb = new StringBuilder();
- sb.append("Error replaying edit log at offset " + in.getPosition());
- sb.append("On transaction ID ").append(txId);
- if (recentOpcodeOffsets[0] != -1) {
- Arrays.sort(recentOpcodeOffsets);
- sb.append("\nRecent opcode offsets:");
- for (long offset : recentOpcodeOffsets) {
- if (offset != -1) {
- sb.append(' ').append(offset);
- }
- }
- }
- String errorMessage = sb.toString();
- FSImage.LOG.error(errorMessage);
- throw new IOException(errorMessage, t);
} finally {
fsDir.writeUnlock();
fsNamesys.writeUnlock();
- }
- if (FSImage.LOG.isDebugEnabled()) {
- dumpOpCounts(opCounts);
+ if (FSImage.LOG.isDebugEnabled()) {
+ dumpOpCounts(opCounts);
+ }
}
return numEdits;
}
+
+ @SuppressWarnings("deprecation")
+ private void applyEditLogOp(FSEditLogOp op, FSDirectory fsDir,
+ int logVersion) throws IOException {
+ switch (op.opCode) {
+ case OP_ADD: {
+ AddCloseOp addCloseOp = (AddCloseOp)op;
+ if (FSNamesystem.LOG.isDebugEnabled()) {
+ FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
+ " numblocks : " + addCloseOp.blocks.length +
+ " clientHolder " + addCloseOp.clientName +
+ " clientMachine " + addCloseOp.clientMachine);
+ }
+ // There are three cases here:
+ // 1. OP_ADD to create a new file
+ // 2. OP_ADD to update file blocks
+ // 3. OP_ADD to open file for append
+ // See if the file already exists (persistBlocks call)
+ INodeFile oldFile = getINodeFile(fsDir, addCloseOp.path);
+ INodeFile newFile = oldFile;
+ if (oldFile == null) { // this is OP_ADD on a new file (case 1)
+ // versions > 0 support per file replication
+ // get name and replication
+ final short replication = fsNamesys.getBlockManager(
+ ).adjustReplication(addCloseOp.replication);
+ PermissionStatus permissions = fsNamesys.getUpgradePermission();
+ if (addCloseOp.permissions != null) {
+ permissions = addCloseOp.permissions;
+ }
+ long blockSize = addCloseOp.blockSize;
+
+ // Versions of HDFS prior to 0.17 may log an OP_ADD transaction
+ // which includes blocks in it. When we update the minimum
+ // upgrade version to something more recent than 0.17, we can
+ // simplify this code by asserting that OP_ADD transactions
+ // don't have any blocks.
+
+ // Older versions of HDFS do not store the block size in the inode.
+ // If the file has more than one block, use the size of the
+ // first block as the blocksize. Otherwise use the default
+ // block size.
+ if (-8 <= logVersion && blockSize == 0) {
+ if (addCloseOp.blocks.length > 1) {
+ blockSize = addCloseOp.blocks[0].getNumBytes();
+ } else {
+ long first = ((addCloseOp.blocks.length == 1)?
+ addCloseOp.blocks[0].getNumBytes(): 0);
+ blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first);
+ }
+ }
+
+ // add to the file tree
+ newFile = (INodeFile)fsDir.unprotectedAddFile(
+ addCloseOp.path, permissions,
+ replication, addCloseOp.mtime,
+ addCloseOp.atime, blockSize,
+ true, addCloseOp.clientName, addCloseOp.clientMachine);
+ fsNamesys.leaseManager.addLease(addCloseOp.clientName, addCloseOp.path);
+
+ } else { // This is OP_ADD on an existing file
+ if (!oldFile.isUnderConstruction()) {
+ // This is case 3: a call to append() on an already-closed file.
+ if (FSNamesystem.LOG.isDebugEnabled()) {
+ FSNamesystem.LOG.debug("Reopening an already-closed file " +
+ "for append");
+ }
+ fsNamesys.prepareFileForWrite(addCloseOp.path, oldFile,
+ addCloseOp.clientName, addCloseOp.clientMachine, null,
+ false);
+ newFile = getINodeFile(fsDir, addCloseOp.path);
+ }
+ }
+ // Fall-through for case 2.
+ // Regardless of whether it's a new file or an updated file,
+ // update the block list.
+
+ // Update the salient file attributes.
+ newFile.setAccessTime(addCloseOp.atime);
+ newFile.setModificationTimeForce(addCloseOp.mtime);
+ updateBlocks(fsDir, addCloseOp, newFile);
+ break;
+ }
+ case OP_CLOSE: {
+ AddCloseOp addCloseOp = (AddCloseOp)op;
+
+ if (FSNamesystem.LOG.isDebugEnabled()) {
+ FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
+ " numblocks : " + addCloseOp.blocks.length +
+ " clientHolder " + addCloseOp.clientName +
+ " clientMachine " + addCloseOp.clientMachine);
+ }
+
+ INodeFile oldFile = getINodeFile(fsDir, addCloseOp.path);
+ if (oldFile == null) {
+ throw new IOException("Operation trying to close non-existent file " +
+ addCloseOp.path);
+ }
+
+ // Update in-memory data structures
+ updateBlocks(fsDir, addCloseOp, oldFile);
+
+ // Now close the file
+ if (!oldFile.isUnderConstruction() &&
+ logVersion <= LayoutVersion.BUGFIX_HDFS_2991_VERSION) {
+ // There was a bug (HDFS-2991) in hadoop < 0.23.1 where OP_CLOSE
+ // could show up twice in a row. But after that version, this
+ // should be fixed, so we should treat it as an error.
+ throw new IOException(
+ "File is not under construction: " + addCloseOp.path);
+ }
+ // One might expect that you could use removeLease(holder, path) here,
+ // but OP_CLOSE doesn't serialize the holder. So, remove by path.
+ if (oldFile.isUnderConstruction()) {
+ INodeFileUnderConstruction ucFile = (INodeFileUnderConstruction) oldFile;
+ fsNamesys.leaseManager.removeLeaseWithPrefixPath(addCloseOp.path);
+ INodeFile newFile = ucFile.convertToInodeFile();
+ fsDir.replaceNode(addCloseOp.path, ucFile, newFile);
+ }
+ break;
+ }
+ case OP_UPDATE_BLOCKS: {
+ UpdateBlocksOp updateOp = (UpdateBlocksOp)op;
+ if (FSNamesystem.LOG.isDebugEnabled()) {
+ FSNamesystem.LOG.debug(op.opCode + ": " + updateOp.path +
+ " numblocks : " + updateOp.blocks.length);
+ }
+ INodeFile oldFile = getINodeFile(fsDir, updateOp.path);
+ if (oldFile == null) {
+ throw new IOException(
+ "Operation trying to update blocks in non-existent file " +
+ updateOp.path);
+ }
+
+ // Update in-memory data structures
+ updateBlocks(fsDir, updateOp, oldFile);
+ break;
+ }
+
+ case OP_SET_REPLICATION: {
+ SetReplicationOp setReplicationOp = (SetReplicationOp)op;
+ short replication = fsNamesys.getBlockManager().adjustReplication(
+ setReplicationOp.replication);
+ fsDir.unprotectedSetReplication(setReplicationOp.path,
+ replication, null);
+ break;
+ }
+ case OP_CONCAT_DELETE: {
+ ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op;
+ fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs,
+ concatDeleteOp.timestamp);
+ break;
+ }
+ case OP_RENAME_OLD: {
+ RenameOldOp renameOp = (RenameOldOp)op;
+ HdfsFileStatus dinfo = fsDir.getFileInfo(renameOp.dst, false);
+ fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
+ renameOp.timestamp);
+ fsNamesys.unprotectedChangeLease(renameOp.src, renameOp.dst, dinfo);
+ break;
+ }
+ case OP_DELETE: {
+ DeleteOp deleteOp = (DeleteOp)op;
+ fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp);
+ break;
+ }
+ case OP_MKDIR: {
+ MkdirOp mkdirOp = (MkdirOp)op;
+ PermissionStatus permissions = fsNamesys.getUpgradePermission();
+ if (mkdirOp.permissions != null) {
+ permissions = mkdirOp.permissions;
+ }
+
+ fsDir.unprotectedMkdir(mkdirOp.path, permissions,
+ mkdirOp.timestamp);
+ break;
+ }
+ case OP_SET_GENSTAMP: {
+ SetGenstampOp setGenstampOp = (SetGenstampOp)op;
+ fsNamesys.setGenerationStamp(setGenstampOp.genStamp);
+ break;
+ }
+ case OP_SET_PERMISSIONS: {
+ SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op;
+ fsDir.unprotectedSetPermission(setPermissionsOp.src,
+ setPermissionsOp.permissions);
+ break;
+ }
+ case OP_SET_OWNER: {
+ SetOwnerOp setOwnerOp = (SetOwnerOp)op;
+ fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username,
+ setOwnerOp.groupname);
+ break;
+ }
+ case OP_SET_NS_QUOTA: {
+ SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op;
+ fsDir.unprotectedSetQuota(setNSQuotaOp.src,
+ setNSQuotaOp.nsQuota,
+ HdfsConstants.QUOTA_DONT_SET);
+ break;
+ }
+ case OP_CLEAR_NS_QUOTA: {
+ ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op;
+ fsDir.unprotectedSetQuota(clearNSQuotaOp.src,
+ HdfsConstants.QUOTA_RESET,
+ HdfsConstants.QUOTA_DONT_SET);
+ break;
+ }
+
+ case OP_SET_QUOTA:
+ SetQuotaOp setQuotaOp = (SetQuotaOp)op;
+ fsDir.unprotectedSetQuota(setQuotaOp.src,
+ setQuotaOp.nsQuota,
+ setQuotaOp.dsQuota);
+ break;
+
+ case OP_TIMES: {
+ TimesOp timesOp = (TimesOp)op;
+
+ fsDir.unprotectedSetTimes(timesOp.path,
+ timesOp.mtime,
+ timesOp.atime, true);
+ break;
+ }
+ case OP_SYMLINK: {
+ SymlinkOp symlinkOp = (SymlinkOp)op;
+ fsDir.unprotectedSymlink(symlinkOp.path, symlinkOp.value,
+ symlinkOp.mtime, symlinkOp.atime,
+ symlinkOp.permissionStatus);
+ break;
+ }
+ case OP_RENAME: {
+ RenameOp renameOp = (RenameOp)op;
+
+ HdfsFileStatus dinfo = fsDir.getFileInfo(renameOp.dst, false);
+ fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
+ renameOp.timestamp, renameOp.options);
+ fsNamesys.unprotectedChangeLease(renameOp.src, renameOp.dst, dinfo);
+ break;
+ }
+ case OP_GET_DELEGATION_TOKEN: {
+ GetDelegationTokenOp getDelegationTokenOp
+ = (GetDelegationTokenOp)op;
+
+ fsNamesys.getDelegationTokenSecretManager()
+ .addPersistedDelegationToken(getDelegationTokenOp.token,
+ getDelegationTokenOp.expiryTime);
+ break;
+ }
+ case OP_RENEW_DELEGATION_TOKEN: {
+ RenewDelegationTokenOp renewDelegationTokenOp
+ = (RenewDelegationTokenOp)op;
+ fsNamesys.getDelegationTokenSecretManager()
+ .updatePersistedTokenRenewal(renewDelegationTokenOp.token,
+ renewDelegationTokenOp.expiryTime);
+ break;
+ }
+ case OP_CANCEL_DELEGATION_TOKEN: {
+ CancelDelegationTokenOp cancelDelegationTokenOp
+ = (CancelDelegationTokenOp)op;
+ fsNamesys.getDelegationTokenSecretManager()
+ .updatePersistedTokenCancellation(
+ cancelDelegationTokenOp.token);
+ break;
+ }
+ case OP_UPDATE_MASTER_KEY: {
+ UpdateMasterKeyOp updateMasterKeyOp = (UpdateMasterKeyOp)op;
+ fsNamesys.getDelegationTokenSecretManager()
+ .updatePersistedMasterKey(updateMasterKeyOp.key);
+ break;
+ }
+ case OP_REASSIGN_LEASE: {
+ ReassignLeaseOp reassignLeaseOp = (ReassignLeaseOp)op;
+
+ Lease lease = fsNamesys.leaseManager.getLease(
+ reassignLeaseOp.leaseHolder);
+ INodeFileUnderConstruction pendingFile =
+ (INodeFileUnderConstruction) fsDir.getFileINode(
+ reassignLeaseOp.path);
+ fsNamesys.reassignLeaseInternal(lease,
+ reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile);
+ break;
+ }
+ case OP_START_LOG_SEGMENT:
+ case OP_END_LOG_SEGMENT: {
+ // no data in here currently.
+ break;
+ }
+ case OP_DATANODE_ADD:
+ case OP_DATANODE_REMOVE:
+ break;
+ default:
+ throw new IOException("Invalid operation read " + op.opCode);
+ }
+ }
+
+ private static String formatEditLogReplayError(EditLogInputStream in,
+ long recentOpcodeOffsets[], long txid) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("Error replaying edit log at offset " + in.getPosition());
+ sb.append(" on transaction ID ").append(txid);
+ if (recentOpcodeOffsets[0] != -1) {
+ Arrays.sort(recentOpcodeOffsets);
+ sb.append("\nRecent opcode offsets:");
+ for (long offset : recentOpcodeOffsets) {
+ if (offset != -1) {
+ sb.append(' ').append(offset);
+ }
+ }
+ }
+ return sb.toString();
+ }
+
+ private static INodeFile getINodeFile(FSDirectory fsDir, String path)
+ throws IOException {
+ INode inode = fsDir.getINode(path);
+ if (inode != null) {
+ if (!(inode instanceof INodeFile)) {
+ throw new IOException("Operation trying to get non-file " + path);
+ }
+ }
+ return (INodeFile)inode;
+ }
+
+ /**
+ * Update in-memory data structures with new block information.
+ * @throws IOException
+ */
+ private void updateBlocks(FSDirectory fsDir, BlockListUpdatingOp op,
+ INodeFile file) throws IOException {
+ // Update its block list
+ BlockInfo[] oldBlocks = file.getBlocks();
+ Block[] newBlocks = op.getBlocks();
+ String path = op.getPath();
+
+ // Are we only updating the last block's gen stamp?
+ boolean isGenStampUpdate = oldBlocks.length == newBlocks.length;
+
+ // First, update blocks in common
+ for (int i = 0; i < oldBlocks.length && i < newBlocks.length; i++) {
+ BlockInfo oldBlock = oldBlocks[i];
+ Block newBlock = newBlocks[i];
+
+ boolean isLastBlock = i == newBlocks.length - 1;
+ if (oldBlock.getBlockId() != newBlock.getBlockId() ||
+ (oldBlock.getGenerationStamp() != newBlock.getGenerationStamp() &&
+ !(isGenStampUpdate && isLastBlock))) {
+ throw new IOException("Mismatched block IDs or generation stamps, " +
+ "attempting to replace block " + oldBlock + " with " + newBlock +
+ " as block # " + i + "/" + newBlocks.length + " of " +
+ path);
+ }
+
+ oldBlock.setNumBytes(newBlock.getNumBytes());
+ boolean changeMade =
+ oldBlock.getGenerationStamp() != newBlock.getGenerationStamp();
+ oldBlock.setGenerationStamp(newBlock.getGenerationStamp());
+
+ if (oldBlock instanceof BlockInfoUnderConstruction &&
+ (!isLastBlock || op.shouldCompleteLastBlock())) {
+ changeMade = true;
+ fsNamesys.getBlockManager().forceCompleteBlock(
+ (INodeFileUnderConstruction)file,
+ (BlockInfoUnderConstruction)oldBlock);
+ }
+ if (changeMade) {
+ // The state or gen-stamp of the block has changed. So, we may be
+ // able to process some messages from datanodes that we previously
+ // were unable to process.
+ fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
+ }
+ }
+
+ if (newBlocks.length < oldBlocks.length) {
+ // We're removing a block from the file, e.g. abandonBlock(...)
+ if (!file.isUnderConstruction()) {
+ throw new IOException("Trying to remove a block from file " +
+ path + " which is not under construction.");
+ }
+ if (newBlocks.length != oldBlocks.length - 1) {
+ throw new IOException("Trying to remove more than one block from file "
+ + path);
+ }
+ fsDir.unprotectedRemoveBlock(path,
+ (INodeFileUnderConstruction)file, oldBlocks[oldBlocks.length - 1]);
+ } else if (newBlocks.length > oldBlocks.length) {
+ // We're adding blocks
+ for (int i = oldBlocks.length; i < newBlocks.length; i++) {
+ Block newBlock = newBlocks[i];
+ BlockInfo newBI;
+ if (!op.shouldCompleteLastBlock()) {
+ // TODO: shouldn't this only be true for the last block?
+ // what about an old-version fsync() where fsync isn't called
+ // until several blocks in?
+ newBI = new BlockInfoUnderConstruction(
+ newBlock, file.getReplication());
+ } else {
+ // OP_CLOSE should add finalized blocks. This code path
+ // is only executed when loading edits written by prior
+ // versions of Hadoop. Current versions always log
+ // OP_ADD operations as each block is allocated.
+ newBI = new BlockInfo(newBlock, file.getReplication());
+ }
+ fsNamesys.getBlockManager().addINode(newBI, file);
+ file.addBlock(newBI);
+ fsNamesys.getBlockManager().processQueuedMessagesForBlock(newBlock);
+ }
+ }
+ }
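Read roughly, the checks above distinguish three cases: when the incoming block list has the same length as the file's current list and only the last block's generation stamp differs, the op is treated as a pipeline-recovery genstamp bump and the mismatch check is skipped for that block; when the incoming list is shorter, it must remove exactly one trailing block of an under-construction file (the abandonBlock case); when it is longer, the extra blocks are added as under-construction unless the op reports shouldCompleteLastBlock(), which per the comment only happens for OP_CLOSE records written by older releases.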
private static void dumpOpCounts(
EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts) {
@@ -517,19 +657,21 @@
FSImage.LOG.debug("Caught exception after reading " + numValid +
" ops from " + in + " while determining its valid length.", t);
}
- return new EditLogValidation(lastPos, firstTxId, lastTxId);
+ return new EditLogValidation(lastPos, firstTxId, lastTxId, false);
}
static class EditLogValidation {
- private long validLength;
- private long startTxId;
- private long endTxId;
+ private final long validLength;
+ private final long startTxId;
+ private final long endTxId;
+ private final boolean corruptionDetected;
- EditLogValidation(long validLength,
- long startTxId, long endTxId) {
+ EditLogValidation(long validLength, long startTxId, long endTxId,
+ boolean corruptionDetected) {
this.validLength = validLength;
this.startTxId = startTxId;
this.endTxId = endTxId;
+ this.corruptionDetected = corruptionDetected;
}
long getValidLength() { return validLength; }
@@ -545,6 +687,8 @@
}
return (endTxId - startTxId) + 1;
}
+
+ boolean hasCorruptHeader() { return corruptionDetected; }
}
/**
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
index f075770c..949554d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
@@ -101,6 +101,7 @@
new LogSegmentOp(OP_START_LOG_SEGMENT));
instances.put(OP_END_LOG_SEGMENT,
new LogSegmentOp(OP_END_LOG_SEGMENT));
+ instances.put(OP_UPDATE_BLOCKS, new UpdateBlocksOp());
return instances;
}
};
@@ -128,8 +129,14 @@
abstract void writeFields(DataOutputStream out)
throws IOException;
+ static interface BlockListUpdatingOp {
+ Block[] getBlocks();
+ String getPath();
+ boolean shouldCompleteLastBlock();
+ }
+
@SuppressWarnings("unchecked")
- static abstract class AddCloseOp extends FSEditLogOp {
+ static abstract class AddCloseOp extends FSEditLogOp implements BlockListUpdatingOp {
int length;
String path;
short replication;
@@ -151,6 +158,10 @@
this.path = path;
return (T)this;
}
+
+ public String getPath() {
+ return path;
+ }
<T extends AddCloseOp> T setReplication(short replication) {
this.replication = replication;
@@ -176,6 +187,10 @@
this.blocks = blocks;
return (T)this;
}
+
+ public Block[] getBlocks() {
+ return blocks;
+ }
<T extends AddCloseOp> T setPermissionStatus(PermissionStatus permissions) {
this.permissions = permissions;
@@ -347,6 +362,10 @@
return (AddOp)opInstances.get().get(OP_ADD);
}
+ public boolean shouldCompleteLastBlock() {
+ return false;
+ }
+
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
@@ -365,6 +384,10 @@
return (CloseOp)opInstances.get().get(OP_CLOSE);
}
+ public boolean shouldCompleteLastBlock() {
+ return true;
+ }
+
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
@@ -373,6 +396,68 @@
return builder.toString();
}
}
+
+ static class UpdateBlocksOp extends FSEditLogOp implements BlockListUpdatingOp {
+ String path;
+ Block[] blocks;
+
+ private UpdateBlocksOp() {
+ super(OP_UPDATE_BLOCKS);
+ }
+
+ static UpdateBlocksOp getInstance() {
+ return (UpdateBlocksOp)opInstances.get()
+ .get(OP_UPDATE_BLOCKS);
+ }
+
+
+ UpdateBlocksOp setPath(String path) {
+ this.path = path;
+ return this;
+ }
+
+ public String getPath() {
+ return path;
+ }
+
+ UpdateBlocksOp setBlocks(Block[] blocks) {
+ this.blocks = blocks;
+ return this;
+ }
+
+ public Block[] getBlocks() {
+ return blocks;
+ }
+
+ @Override
+ void writeFields(DataOutputStream out) throws IOException {
+ FSImageSerialization.writeString(path, out);
+ FSImageSerialization.writeCompactBlockArray(blocks, out);
+ }
+
+ @Override
+ void readFields(DataInputStream in, int logVersion) throws IOException {
+ path = FSImageSerialization.readString(in);
+ this.blocks = FSImageSerialization.readCompactBlockArray(
+ in, logVersion);
+ }
+
+ @Override
+ public boolean shouldCompleteLastBlock() {
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("UpdateBlocksOp [path=")
+ .append(path)
+ .append(", blocks=")
+ .append(Arrays.toString(blocks))
+ .append("]");
+ return sb.toString();
+ }
+ }
static class SetReplicationOp extends FSEditLogOp {
String path;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java
index 220c267..1f809c1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java
@@ -55,7 +55,8 @@
OP_UPDATE_MASTER_KEY ((byte) 21),
OP_REASSIGN_LEASE ((byte) 22),
OP_END_LOG_SEGMENT ((byte) 23),
- OP_START_LOG_SEGMENT ((byte) 24);
+ OP_START_LOG_SEGMENT ((byte) 24),
+ OP_UPDATE_BLOCKS ((byte) 25);
private byte opCode;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
index 463fca5..7fb3d4b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
@@ -56,6 +56,8 @@
import org.apache.hadoop.hdfs.util.MD5FileUtils;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
@@ -68,7 +70,7 @@
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FSImage implements Closeable {
- protected static final Log LOG = LogFactory.getLog(FSImage.class.getName());
+ public static final Log LOG = LogFactory.getLog(FSImage.class.getName());
protected FSEditLog editLog = null;
private boolean isUpgradeFinalized = false;
@@ -112,7 +114,8 @@
* @throws IOException if directories are invalid.
*/
protected FSImage(Configuration conf,
- Collection<URI> imageDirs, Collection<URI> editsDirs)
+ Collection<URI> imageDirs,
+ List<URI> editsDirs)
throws IOException {
this.conf = conf;
@@ -123,6 +126,12 @@
}
this.editLog = new FSEditLog(conf, storage, editsDirs);
+ String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
+ if (!HAUtil.isHAEnabled(conf, nameserviceId)) {
+ editLog.initJournalsForWrite();
+ } else {
+ editLog.initSharedJournalsForRead();
+ }
archivalManager = new NNStorageRetentionManager(conf, storage, editLog);
}
@@ -251,6 +260,11 @@
StorageState curState;
try {
curState = sd.analyzeStorage(startOpt, storage);
+ String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
+ if (curState != StorageState.NORMAL && HAUtil.isHAEnabled(conf, nameserviceId)) {
+ throw new IOException("Cannot start an HA namenode with name dirs " +
+ "that need recovery. Dir: " + sd + " state: " + curState);
+ }
// sd is locked but not opened
switch(curState) {
case NON_EXISTENT:
@@ -324,9 +338,9 @@
File prevDir = sd.getPreviousDir();
File tmpDir = sd.getPreviousTmp();
assert curDir.exists() : "Current directory must exist.";
- assert !prevDir.exists() : "prvious directory must not exist.";
- assert !tmpDir.exists() : "prvious.tmp directory must not exist.";
- assert !editLog.isOpen() : "Edits log must not be open.";
+ assert !prevDir.exists() : "previous directory must not exist.";
+ assert !tmpDir.exists() : "previous.tmp directory must not exist.";
+ assert !editLog.isSegmentOpen() : "Edits log must not be open.";
// rename current to tmp
NNStorage.rename(curDir, tmpDir);
@@ -469,7 +483,7 @@
void doImportCheckpoint(FSNamesystem target) throws IOException {
Collection<URI> checkpointDirs =
FSImage.getCheckpointDirs(conf, null);
- Collection<URI> checkpointEditsDirs =
+ List<URI> checkpointEditsDirs =
FSImage.getCheckpointEditsDirs(conf, null);
if (checkpointDirs == null || checkpointDirs.isEmpty()) {
@@ -519,11 +533,9 @@
return editLog;
}
- void openEditLog() throws IOException {
+ void openEditLogForWrite() throws IOException {
assert editLog != null : "editLog must be initialized";
- Preconditions.checkState(!editLog.isOpen(),
- "edit log should not yet be open");
- editLog.open();
+ editLog.openForWrite();
storage.writeTransactionIdFileToStorage(editLog.getCurSegmentTxId());
};
@@ -564,12 +576,19 @@
Iterable<EditLogInputStream> editStreams = null;
- editLog.recoverUnclosedStreams();
+ if (editLog.isOpenForWrite()) {
+ // We only want to recover streams if we're going into Active mode.
+ editLog.recoverUnclosedStreams();
+ }
if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT,
getLayoutVersion())) {
+ // If we're open for write, we're either non-HA or we're the active NN, so
+ // we better be able to load all the edits. If we're the standby NN, it's
+ // OK to not be able to read all of edits right now.
+ long toAtLeastTxId = editLog.isOpenForWrite() ? inspector.getMaxSeenTxId() : 0;
editStreams = editLog.selectInputStreams(imageFile.getCheckpointTxId() + 1,
- inspector.getMaxSeenTxId());
+ toAtLeastTxId, false);
} else {
editStreams = FSImagePreTransactionalStorageInspector
.getEditLogStreams(storage);
@@ -644,12 +663,12 @@
* Load the specified list of edit files into the image.
* @return the number of transactions loaded
*/
- protected long loadEdits(Iterable<EditLogInputStream> editStreams,
- FSNamesystem target) throws IOException {
+ public long loadEdits(Iterable<EditLogInputStream> editStreams,
+ FSNamesystem target) throws IOException, EditLogInputException {
LOG.debug("About to load edits:\n " + Joiner.on("\n ").join(editStreams));
long startingTxId = getLastAppliedTxId() + 1;
- int numLoaded = 0;
+ long numLoaded = 0;
try {
FSEditLogLoader loader = new FSEditLogLoader(target);
@@ -657,17 +676,26 @@
// Load latest edits
for (EditLogInputStream editIn : editStreams) {
LOG.info("Reading " + editIn + " expecting start txid #" + startingTxId);
- int thisNumLoaded = loader.loadFSEdits(editIn, startingTxId);
- startingTxId += thisNumLoaded;
- numLoaded += thisNumLoaded;
- lastAppliedTxId += thisNumLoaded;
+ long thisNumLoaded = 0;
+ try {
+ thisNumLoaded = loader.loadFSEdits(editIn, startingTxId);
+ } catch (EditLogInputException elie) {
+ thisNumLoaded = elie.getNumEditsLoaded();
+ throw elie;
+ } finally {
+ // Update lastAppliedTxId even in case of error, since some ops may
+ // have been successfully applied before the error.
+ lastAppliedTxId = startingTxId + thisNumLoaded - 1;
+ startingTxId += thisNumLoaded;
+ numLoaded += thisNumLoaded;
+ }
}
} finally {
FSEditLog.closeAllStreams(editStreams);
+ // update the counts
+ target.dir.updateCountForINodeWithQuota();
}
-
- // update the counts
- target.dir.updateCountForINodeWithQuota();
+
return numLoaded;
}
@@ -688,8 +716,7 @@
/**
* Load in the filesystem image from file. It's a big list of
- * filenames and blocks. Return whether we should
- * "re-save" and consolidate the edit-logs
+ * filenames and blocks.
*/
private void loadFSImage(File curFile, MD5Hash expectedMd5,
FSNamesystem target) throws IOException {
@@ -786,16 +813,16 @@
* Save the contents of the FS image to a new image file in each of the
* current storage directories.
*/
- synchronized void saveNamespace(FSNamesystem source) throws IOException {
+ public synchronized void saveNamespace(FSNamesystem source) throws IOException {
assert editLog != null : "editLog must be initialized";
storage.attemptRestoreRemovedStorage();
- boolean editLogWasOpen = editLog.isOpen();
+ boolean editLogWasOpen = editLog.isSegmentOpen();
if (editLogWasOpen) {
editLog.endCurrentLogSegment(true);
}
- long imageTxId = editLog.getLastWrittenTxId();
+ long imageTxId = getLastAppliedOrWrittenTxId();
try {
saveFSImageInAllDirs(source, imageTxId);
storage.writeAll();
@@ -812,7 +839,7 @@
}
- void cancelSaveNamespace(String reason)
+ public void cancelSaveNamespace(String reason)
throws InterruptedException {
SaveNamespaceContext ctx = curSaveNamespaceContext;
if (ctx != null) {
@@ -1061,7 +1088,7 @@
return Util.stringCollectionAsURIs(dirNames);
}
- static Collection<URI> getCheckpointEditsDirs(Configuration conf,
+ static List<URI> getCheckpointEditsDirs(Configuration conf,
String defaultName) {
Collection<String> dirNames =
conf.getStringCollection(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY);
@@ -1095,4 +1122,16 @@
return lastAppliedTxId;
}
+ public long getLastAppliedOrWrittenTxId() {
+ return Math.max(lastAppliedTxId,
+ editLog != null ? editLog.getLastWrittenTxId() : 0);
+ }
+
+ public void updateLastAppliedTxIdFromWritten() {
+ this.lastAppliedTxId = editLog.getLastWrittenTxId();
+ }
+
+ public synchronized long getMostRecentCheckpointTxId() {
+ return storage.getMostRecentCheckpointTxId();
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java
index 5b480305..f508433 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageSerialization.java
@@ -40,6 +40,7 @@
import org.apache.hadoop.io.ShortWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
/**
* Static utility functions for serializing various pieces of data in the correct
@@ -277,6 +278,49 @@
ustr.getLength(), (byte) Path.SEPARATOR_CHAR);
}
+
+ /**
+ * Write an array of blocks as compactly as possible. This uses
+ * delta-encoding for the generation stamp and size, following
+ * the principle that genstamp increases relatively slowly,
+ * and size is equal for all but the last block of a file.
+ */
+ public static void writeCompactBlockArray(
+ Block[] blocks, DataOutputStream out) throws IOException {
+ WritableUtils.writeVInt(out, blocks.length);
+ Block prev = null;
+ for (Block b : blocks) {
+ long szDelta = b.getNumBytes() -
+ (prev != null ? prev.getNumBytes() : 0);
+ long gsDelta = b.getGenerationStamp() -
+ (prev != null ? prev.getGenerationStamp() : 0);
+ out.writeLong(b.getBlockId()); // blockid is random
+ WritableUtils.writeVLong(out, szDelta);
+ WritableUtils.writeVLong(out, gsDelta);
+ prev = b;
+ }
+ }
+
+ public static Block[] readCompactBlockArray(
+ DataInputStream in, int logVersion) throws IOException {
+ int num = WritableUtils.readVInt(in);
+ if (num < 0) {
+ throw new IOException("Invalid block array length: " + num);
+ }
+ Block prev = null;
+ Block[] ret = new Block[num];
+ for (int i = 0; i < num; i++) {
+ long id = in.readLong();
+ long sz = WritableUtils.readVLong(in) +
+ ((prev != null) ? prev.getNumBytes() : 0);
+ long gs = WritableUtils.readVLong(in) +
+ ((prev != null) ? prev.getGenerationStamp() : 0);
+ ret[i] = new Block(id, sz, gs);
+ prev = ret[i];
+ }
+ return ret;
+ }
+
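A minimal sketch of the round trip (invented class name, block IDs, sizes and genstamps; assumes the Hadoop classes above are on the classpath):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.namenode.FSImageSerialization;

public class CompactBlockArraySketch {
  public static void main(String[] args) throws IOException {
    // A full first block and a short partial last block (invented values).
    Block[] blocks = new Block[] {
        new Block(1001L, 134217728L, 2000L),
        new Block(1002L, 4096L, 2001L)
    };

    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    FSImageSerialization.writeCompactBlockArray(blocks, new DataOutputStream(bos));
    // On the wire: a vint count, then per block the raw 8-byte id plus vlong
    // deltas -- here the second block writes szDelta = 4096 - 134217728 and
    // gsDelta = 1 rather than its absolute size and genstamp.

    DataInputStream in =
        new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
    // The reader above does not consult logVersion, so any value works here.
    Block[] back = FSImageSerialization.readCompactBlockArray(in, -40);
    assert back.length == 2 && back[1].getNumBytes() == 4096L
        && back[1].getGenerationStamp() == 2001L;
  }
}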
/**
* DatanodeImage is used to store persistent information
* about datanodes into the fsImage.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index 604fbb0..f22f808 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -32,6 +32,8 @@
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_MAX_OBJECTS_DEFAULT;
@@ -47,10 +49,15 @@
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SAFEMODE_THRESHOLD_PCT_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_DEFAULT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_STANDBY_CHECKPOINTS_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_UPGRADE_PERMISSION_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_UPGRADE_PERMISSION_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERSIST_BLOCKS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
@@ -68,6 +75,7 @@
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
+import java.io.StringWriter;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.URI;
@@ -80,6 +88,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -108,7 +117,10 @@
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
import org.apache.hadoop.hdfs.protocol.Block;
@@ -147,9 +159,18 @@
import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport;
import org.apache.hadoop.hdfs.server.common.Util;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
+import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
+import org.apache.hadoop.hdfs.server.namenode.ha.ActiveState;
+import org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer;
+import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
+import org.apache.hadoop.hdfs.server.namenode.ha.HAState;
+import org.apache.hadoop.hdfs.server.namenode.ha.StandbyCheckpointer;
+import org.apache.hadoop.hdfs.server.namenode.ha.StandbyState;
import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
+import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
@@ -157,6 +178,7 @@
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.Server;
+import org.apache.hadoop.ipc.StandbyException;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -170,13 +192,12 @@
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.delegation.DelegationKey;
import org.apache.hadoop.util.Daemon;
-import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.VersionInfo;
import org.mortbay.util.ajax.JSON;
-import com.google.common.base.Preconditions;
-
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
/***************************************************
* FSNamesystem does the actual bookkeeping work for the
@@ -194,7 +215,7 @@
@Metrics(context="dfs")
public class FSNamesystem implements Namesystem, FSClusterStats,
FSNamesystemMBean, NameNodeMXBean {
- static final Log LOG = LogFactory.getLog(FSNamesystem.class);
+ public static final Log LOG = LogFactory.getLog(FSNamesystem.class);
private static final ThreadLocal<StringBuilder> auditBuffer =
new ThreadLocal<StringBuilder>() {
@@ -243,14 +264,18 @@
static final int DEFAULT_MAX_CORRUPT_FILEBLOCKS_RETURNED = 100;
static int BLOCK_DELETION_INCREMENT = 1000;
private boolean isPermissionEnabled;
+ private boolean persistBlocks;
private UserGroupInformation fsOwner;
private String supergroup;
private PermissionStatus defaultPermission;
+ private boolean standbyShouldCheckpoint;
// Scan interval is not configurable.
private static final long DELEGATION_TOKEN_REMOVER_SCAN_INTERVAL =
TimeUnit.MILLISECONDS.convert(1, TimeUnit.HOURS);
private DelegationTokenSecretManager dtSecretManager;
+ private boolean alwaysUseDelegationTokensForTests;
+
//
// Stores the correct file name hierarchy
@@ -264,7 +289,6 @@
LeaseManager leaseManager = new LeaseManager(this);
- Daemon lmthread = null; // LeaseMonitor thread
Daemon smmthread = null; // SafeModeMonitor thread
Daemon nnrmthread = null; // NamenodeResourceMonitor thread
@@ -300,7 +324,26 @@
// lock to protect FSNamesystem.
private ReentrantReadWriteLock fsLock;
-
+ /**
+ * Used when this NN is in standby state to read from the shared edit log.
+ */
+ private EditLogTailer editLogTailer = null;
+
+ /**
+ * Used when this NN is in standby state to perform checkpoints.
+ */
+ private StandbyCheckpointer standbyCheckpointer;
+
+ /**
+ * Reference to the NN's HAContext object. This is only set once
+ * {@link #startCommonServices(Configuration, HAContext)} is called.
+ */
+ private HAContext haContext;
+
+ private boolean haEnabled;
+
+ private final Configuration conf;
+
/**
* Instantiates an FSNamesystem loaded from the image and edits
* directories specified in the passed Configuration.
@@ -310,9 +353,10 @@
* @return an FSNamesystem which contains the loaded namespace
* @throws IOException if loading fails
*/
- public static FSNamesystem loadFromDisk(Configuration conf) throws IOException {
+ public static FSNamesystem loadFromDisk(Configuration conf)
+ throws IOException {
Collection<URI> namespaceDirs = FSNamesystem.getNamespaceDirs(conf);
- Collection<URI> namespaceEditsDirs =
+ List<URI> namespaceEditsDirs =
FSNamesystem.getNamespaceEditsDirs(conf);
if (namespaceDirs.size() == 1) {
@@ -329,7 +373,9 @@
long loadStart = now();
StartupOption startOpt = NameNode.getStartupOption(conf);
- namesystem.loadFSImage(startOpt, fsImage);
+ String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
+ namesystem.loadFSImage(startOpt, fsImage,
+ HAUtil.isHAEnabled(conf, nameserviceId));
long timeTakenToLoadFSImage = now() - loadStart;
LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
NameNode.getNameNodeMetrics().setFsImageLoadTime(
@@ -348,6 +394,7 @@
* @throws IOException on bad configuration
*/
FSNamesystem(Configuration conf, FSImage fsImage) throws IOException {
+ this.conf = conf;
try {
initialize(conf, fsImage);
} catch(IOException e) {
@@ -375,7 +422,7 @@
this.safeMode = new SafeModeInfo(conf);
}
- void loadFSImage(StartupOption startOpt, FSImage fsImage)
+ void loadFSImage(StartupOption startOpt, FSImage fsImage, boolean haEnabled)
throws IOException {
// format before starting up if requested
if (startOpt == StartupOption.FORMAT) {
@@ -385,43 +432,71 @@
startOpt = StartupOption.REGULAR;
}
boolean success = false;
+ writeLock();
try {
- if (fsImage.recoverTransitionRead(startOpt, this)) {
+ // We shouldn't be calling saveNamespace if we've come up in standby state.
+ if (fsImage.recoverTransitionRead(startOpt, this) && !haEnabled) {
fsImage.saveNamespace(this);
}
- fsImage.openEditLog();
+ // This will start a new log segment and write to the seen_txid file, so
+ // we shouldn't do it when coming up in standby state
+ if (!haEnabled) {
+ fsImage.openEditLogForWrite();
+ }
success = true;
} finally {
if (!success) {
fsImage.close();
}
+ writeUnlock();
}
dir.imageLoadComplete();
}
- void activateSecretManager() throws IOException {
+ private void startSecretManager() {
if (dtSecretManager != null) {
- dtSecretManager.startThreads();
+ try {
+ dtSecretManager.startThreads();
+ } catch (IOException e) {
+ // Inability to start the secret manager
+ // cannot be recovered from.
+ throw new RuntimeException(e);
+ }
}
}
- /**
- * Activate FSNamesystem daemons.
- */
- void activate(Configuration conf) throws IOException {
- this.registerMBean(); // register the MBean for the FSNamesystemState
+ private void startSecretManagerIfNecessary() {
+ boolean shouldRun = shouldUseDelegationTokens() &&
+ !isInSafeMode() && getEditLog().isOpenForWrite();
+ boolean running = dtSecretManager.isRunning();
+ if (shouldRun && !running) {
+ startSecretManager();
+ }
+ }
+ private void stopSecretManager() {
+ if (dtSecretManager != null) {
+ dtSecretManager.stopThreads();
+ }
+ }
+
+ /**
+ * Start services common to both active and standby states
+ * @param haContext
+ * @throws IOException
+ */
+ void startCommonServices(Configuration conf, HAContext haContext) throws IOException {
+ this.registerMBean(); // register the MBean for the FSNamesystemState
writeLock();
+ this.haContext = haContext;
try {
nnResourceChecker = new NameNodeResourceChecker(conf);
checkAvailableResources();
-
+ assert safeMode != null &&
+ !safeMode.isPopulatingReplQueues();
setBlockTotal();
blockManager.activate(conf);
-
- this.lmthread = new Daemon(leaseManager.new Monitor());
- lmthread.start();
this.nnrmthread = new Daemon(new NameNodeResourceMonitor());
nnrmthread.start();
} finally {
@@ -431,24 +506,169 @@
registerMXBean();
DefaultMetricsSystem.instance().register(this);
}
-
- public static Collection<URI> getNamespaceDirs(Configuration conf) {
- return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
- }
- public static Collection<URI> getNamespaceEditsDirs(Configuration conf) {
- Collection<URI> editsDirs = getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY);
- if (editsDirs.isEmpty()) {
- // If this is the case, no edit dirs have been explicitly configured.
- // Image dirs are to be used for edits too.
- return getNamespaceDirs(conf);
- } else {
- return editsDirs;
+ /**
+ * Stop services common to both active and standby states
+ * @throws IOException
+ */
+ void stopCommonServices() {
+ writeLock();
+ try {
+ if (blockManager != null) blockManager.close();
+ if (nnrmthread != null) nnrmthread.interrupt();
+ } finally {
+ writeUnlock();
}
}
+ /**
+ * Start services required in active state
+ * @throws IOException
+ */
+ void startActiveServices() throws IOException {
+ LOG.info("Starting services required for active state");
+ writeLock();
+ try {
+ FSEditLog editLog = dir.fsImage.getEditLog();
+
+ if (!editLog.isOpenForWrite()) {
+ // During startup, we're already open for write during initialization.
+ editLog.initJournalsForWrite();
+ // May need to recover
+ editLog.recoverUnclosedStreams();
+
+ LOG.info("Catching up to latest edits from old active before " +
+ "taking over writer role in edits logs.");
+ editLogTailer.catchupDuringFailover();
+
+ LOG.info("Reprocessing replication and invalidation queues...");
+ blockManager.getDatanodeManager().markAllDatanodesStale();
+ blockManager.clearQueues();
+ blockManager.processAllPendingDNMessages();
+ blockManager.processMisReplicatedBlocks();
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("NameNode metadata after re-processing " +
+ "replication and invalidation queues during failover:\n" +
+ metaSaveAsString());
+ }
+
+ long nextTxId = dir.fsImage.getLastAppliedTxId() + 1;
+ LOG.info("Will take over writing edit logs at txnid " +
+ nextTxId);
+ editLog.setNextTxId(nextTxId);
+
+ dir.fsImage.editLog.openForWrite();
+ }
+ if (haEnabled) {
+ // Renew all of the leases before becoming active.
+ // This is because, while we were in standby mode,
+ // the leases weren't getting renewed on this NN.
+ // Give them all a fresh start here.
+ leaseManager.renewAllLeases();
+ }
+ leaseManager.startMonitor();
+ startSecretManagerIfNecessary();
+ } finally {
+ writeUnlock();
+ }
+ }
+
+ private boolean shouldUseDelegationTokens() {
+ return UserGroupInformation.isSecurityEnabled() ||
+ alwaysUseDelegationTokensForTests;
+ }
+
+ /**
+ * Stop services required in active state
+ * @throws InterruptedException
+ */
+ void stopActiveServices() {
+ LOG.info("Stopping services started for active state");
+ writeLock();
+ try {
+ stopSecretManager();
+ if (leaseManager != null) {
+ leaseManager.stopMonitor();
+ }
+ if (dir != null && dir.fsImage != null) {
+ if (dir.fsImage.editLog != null) {
+ dir.fsImage.editLog.close();
+ }
+ // Update the fsimage with the last txid that we wrote
+ // so that the tailer starts from the right spot.
+ dir.fsImage.updateLastAppliedTxIdFromWritten();
+ }
+ } finally {
+ writeUnlock();
+ }
+ }
+
+ /** Start services required in standby state */
+ void startStandbyServices() {
+ LOG.info("Starting services required for standby state");
+ if (!dir.fsImage.editLog.isOpenForRead()) {
+ // During startup, we're already open for read.
+ dir.fsImage.editLog.initSharedJournalsForRead();
+ }
+ editLogTailer = new EditLogTailer(this);
+ editLogTailer.start();
+ if (standbyShouldCheckpoint) {
+ standbyCheckpointer = new StandbyCheckpointer(conf, this);
+ standbyCheckpointer.start();
+ }
+ }
+
+
+ /**
+ * Called while the NN is in Standby state, but just about to be
+ * asked to enter Active state. This cancels any checkpoints
+ * currently being taken.
+ */
+ void prepareToStopStandbyServices() throws ServiceFailedException {
+ if (standbyCheckpointer != null) {
+ standbyCheckpointer.cancelAndPreventCheckpoints();
+ }
+ }
+
+ /** Stop services required in standby state */
+ void stopStandbyServices() throws IOException {
+ LOG.info("Stopping services started for standby state");
+ if (standbyCheckpointer != null) {
+ standbyCheckpointer.stop();
+ }
+ if (editLogTailer != null) {
+ editLogTailer.stop();
+ }
+ if (dir != null && dir.fsImage != null && dir.fsImage.editLog != null) {
+ dir.fsImage.editLog.close();
+ }
+ }
+
+
+ void checkOperation(OperationCategory op) throws StandbyException {
+ if (haContext != null) {
+ // null in some unit tests
+ haContext.checkOperation(op);
+ }
+ }
+
+ public static Collection<URI> getNamespaceDirs(Configuration conf) {
+ return getStorageDirs(conf, DFS_NAMENODE_NAME_DIR_KEY);
+ }
+
+ /**
+ * Get all edits dirs which are required. If any shared edits dirs are
+ * configured, these are also included in the set of required dirs.
+ *
+ * @param conf the HDFS configuration.
+ * @return all required dirs.
+ */
public static Collection<URI> getRequiredNamespaceEditsDirs(Configuration conf) {
- return getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY);
+ Set<URI> ret = new HashSet<URI>();
+ ret.addAll(getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY));
+ ret.addAll(getSharedEditsDirs(conf));
+ return ret;
}
private static Collection<URI> getStorageDirs(Configuration conf,
@@ -481,6 +701,75 @@
return Util.stringCollectionAsURIs(dirNames);
}
+ /**
+ * Return an ordered list of edits directories to write to.
+ * The list is ordered such that all shared edits directories
+ * come before non-shared directories, and any duplicates
+ * are removed. The order in which they are specified in the
+ * configuration is retained.
+ * @return an ordered list of edits directories to write to
+ * @throws IOException if multiple shared edits directories are configured
+ */
+ public static List<URI> getNamespaceEditsDirs(Configuration conf)
+ throws IOException {
+ // Use a LinkedHashSet so that order is maintained while we de-dup
+ // the entries.
+ LinkedHashSet<URI> editsDirs = new LinkedHashSet<URI>();
+
+ List<URI> sharedDirs = getSharedEditsDirs(conf);
+
+ // Fail until multiple shared edits directories are supported (HDFS-2782)
+ if (sharedDirs.size() > 1) {
+ throw new IOException(
+ "Multiple shared edits directories are not yet supported");
+ }
+
+ // First add the shared edits dirs. It's critical that the shared dirs
+ // are added first, since JournalSet syncs them in the order they are listed,
+ // and we need to make sure all edits are in place in the shared storage
+ // before they are replicated locally. See HDFS-2874.
+ for (URI dir : sharedDirs) {
+ if (!editsDirs.add(dir)) {
+ LOG.warn("Edits URI " + dir + " listed multiple times in " +
+ DFS_NAMENODE_SHARED_EDITS_DIR_KEY + ". Ignoring duplicates.");
+ }
+ }
+
+ // Now add the non-shared dirs.
+ for (URI dir : getStorageDirs(conf, DFS_NAMENODE_EDITS_DIR_KEY)) {
+ if (!editsDirs.add(dir)) {
+ LOG.warn("Edits URI " + dir + " listed multiple times in " +
+ DFS_NAMENODE_SHARED_EDITS_DIR_KEY + " and " +
+ DFS_NAMENODE_EDITS_DIR_KEY + ". Ignoring duplicates.");
+ }
+ }
+
+ if (editsDirs.isEmpty()) {
+ // If this is the case, no edit dirs have been explicitly configured.
+ // Image dirs are to be used for edits too.
+ return Lists.newArrayList(getNamespaceDirs(conf));
+ } else {
+ return Lists.newArrayList(editsDirs);
+ }
+ }
+
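A small sketch of the resulting ordering, assuming one shared and one local directory (invented URIs and class name):

import java.net.URI;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;

public class EditsDirOrderingSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY,
        "file:///mnt/shared/edits");
    // The shared URI is repeated here to exercise the de-duplication warning.
    conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
        "file:///data/1/edits,file:///mnt/shared/edits");

    List<URI> dirs = FSNamesystem.getNamespaceEditsDirs(conf);
    // Shared dirs come first, duplicates are dropped:
    // [file:///mnt/shared/edits, file:///data/1/edits]
    System.out.println(dirs);
  }
}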
+ /**
+ * Returns edit directories that are shared between the active and standby NameNodes.
+ * @param conf
+ * @return Collection of edit directories.
+ */
+ public static List<URI> getSharedEditsDirs(Configuration conf) {
+ // don't use getStorageDirs here, because we want an empty default
+ // rather than the dir in /tmp
+ Collection<String> dirNames = conf.getTrimmedStringCollection(
+ DFS_NAMENODE_SHARED_EDITS_DIR_KEY);
+ return Util.stringCollectionAsURIs(dirNames);
+ }
+
+ public Configuration getConf() {
+ return conf;
+ }
+
@Override
public void readLock() {
this.fsLock.readLock().lock();
@@ -494,6 +783,10 @@
this.fsLock.writeLock().lock();
}
@Override
+ public void writeLockInterruptibly() throws InterruptedException {
+ this.fsLock.writeLock().lockInterruptibly();
+ }
+ @Override
public void writeUnlock() {
this.fsLock.writeLock().unlock();
}
@@ -526,6 +819,26 @@
DFS_PERMISSIONS_ENABLED_DEFAULT);
LOG.info("supergroup=" + supergroup);
LOG.info("isPermissionEnabled=" + isPermissionEnabled);
+
+ this.persistBlocks = conf.getBoolean(DFS_PERSIST_BLOCKS_KEY,
+ DFS_PERSIST_BLOCKS_DEFAULT);
+ // block allocation has to be persisted in HA using a shared edits directory
+ // so that the standby has up-to-date namespace information
+ String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
+ this.haEnabled = HAUtil.isHAEnabled(conf, nameserviceId);
+ this.persistBlocks |= haEnabled && HAUtil.usesSharedEditsDir(conf);
+
+ // Sanity check the HA-related config.
+ if (nameserviceId != null) {
+ LOG.info("Determined nameservice ID: " + nameserviceId);
+ }
+ LOG.info("HA Enabled: " + haEnabled);
+ if (!haEnabled && HAUtil.usesSharedEditsDir(conf)) {
+ LOG.warn("Configured NNs:\n" + DFSUtil.nnAddressesAsString(conf));
+ throw new IOException("Invalid configuration: a shared edits dir " +
+ "must not be specified if HA is not enabled.");
+ }
+
short filePermission = (short)conf.getInt(DFS_NAMENODE_UPGRADE_PERMISSION_KEY,
DFS_NAMENODE_UPGRADE_PERMISSION_DEFAULT);
this.defaultPermission = PermissionStatus.createImmutable(
@@ -546,6 +859,16 @@
DFS_SUPPORT_APPEND_DEFAULT);
this.dtpReplaceDatanodeOnFailure = ReplaceDatanodeOnFailure.get(conf);
+
+ this.standbyShouldCheckpoint = conf.getBoolean(
+ DFS_HA_STANDBY_CHECKPOINTS_KEY,
+ DFS_HA_STANDBY_CHECKPOINTS_DEFAULT);
+
+ // For testing purposes, allow the DT secret manager to be started regardless
+ // of whether security is enabled.
+ alwaysUseDelegationTokensForTests =
+ conf.getBoolean(DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY,
+ DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_DEFAULT);
}
/**
@@ -566,7 +889,7 @@
}
/**
- * Version of {@see #getNamespaceInfo()} that is not protected by a lock.
+ * Version of {@link #getNamespaceInfo()} that is not protected by a lock.
*/
NamespaceInfo unprotectedGetNamespaceInfo() {
return new NamespaceInfo(dir.fsImage.getStorage().getNamespaceID(),
@@ -583,23 +906,16 @@
void close() {
fsRunning = false;
try {
- if (blockManager != null) blockManager.close();
+ stopCommonServices();
if (smmthread != null) smmthread.interrupt();
- if (dtSecretManager != null) dtSecretManager.stopThreads();
- if (nnrmthread != null) nnrmthread.interrupt();
- } catch (Exception e) {
- LOG.warn("Exception shutting down FSNamesystem", e);
} finally {
// using finally to ensure we also wait for lease daemon
try {
- if (lmthread != null) {
- lmthread.interrupt();
- lmthread.join(3000);
- }
+ stopActiveServices();
+ stopStandbyServices();
if (dir != null) {
dir.close();
}
- } catch (InterruptedException ie) {
} catch (IOException ie) {
LOG.error("Error closing FSDirectory", ie);
IOUtils.cleanup(LOG, dir);
@@ -611,6 +927,18 @@
public boolean isRunning() {
return fsRunning;
}
+
+ @Override
+ public boolean isInStandbyState() {
+ if (haContext == null || haContext.getState() == null) {
+ // We're still starting up. In this case, if HA is
+ // on for the cluster, we always start in standby. Otherwise
+ // start in active.
+ return haEnabled;
+ }
+
+ return haContext.getState() instanceof StandbyState;
+ }
/**
* Dump all metadata into specified file
@@ -622,14 +950,7 @@
File file = new File(System.getProperty("hadoop.log.dir"), filename);
PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file,
true)));
-
- long totalInodes = this.dir.totalInodes();
- long totalBlocks = this.getBlocksTotal();
- out.println(totalInodes + " files and directories, " + totalBlocks
- + " blocks = " + (totalInodes + totalBlocks) + " total");
-
- blockManager.metaSave(out);
-
+ metaSave(out);
out.flush();
out.close();
} finally {
@@ -637,11 +958,31 @@
}
}
+ private void metaSave(PrintWriter out) {
+ assert hasWriteLock();
+ long totalInodes = this.dir.totalInodes();
+ long totalBlocks = this.getBlocksTotal();
+ out.println(totalInodes + " files and directories, " + totalBlocks
+ + " blocks = " + (totalInodes + totalBlocks) + " total");
+
+ blockManager.metaSave(out);
+ }
+
+ private String metaSaveAsString() {
+ StringWriter sw = new StringWriter();
+ PrintWriter pw = new PrintWriter(sw);
+ metaSave(pw);
+ pw.flush();
+ return sw.toString();
+ }
+
+
long getDefaultBlockSize() {
return serverDefaults.getBlockSize();
}
- FsServerDefaults getServerDefaults() {
+ FsServerDefaults getServerDefaults() throws StandbyException {
+ checkOperation(OperationCategory.READ);
return serverDefaults;
}
@@ -668,6 +1009,8 @@
HdfsFileStatus resultingStat = null;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Cannot set permission for " + src, safeMode);
}
@@ -697,6 +1040,8 @@
HdfsFileStatus resultingStat = null;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Cannot set owner for " + src, safeMode);
}
@@ -787,13 +1132,14 @@
} else { // second attempt is with write lock
writeLock(); // writelock is needed to set accesstime
}
-
- // if the namenode is in safemode, then do not update access time
- if (isInSafeMode()) {
- doAccessTime = false;
- }
-
try {
+ checkOperation(OperationCategory.READ);
+
+ // if the namenode is in safemode, then do not update access time
+ if (isInSafeMode()) {
+ doAccessTime = false;
+ }
+
long now = now();
INodeFile inode = dir.getFileINode(src);
if (inode == null) {
@@ -861,6 +1207,7 @@
HdfsFileStatus resultingStat = null;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot concat " + target, safeMode);
}
@@ -992,6 +1339,8 @@
}
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
// Write access is required to set access and modification times
if (isPermissionEnabled) {
checkPathAccess(src, FsAction.WRITE);
@@ -1022,6 +1371,8 @@
HdfsFileStatus resultingStat = null;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (!createParent) {
verifyParentDir(link);
}
@@ -1091,6 +1442,8 @@
final boolean isFile;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Cannot set replication for " + src, safeMode);
}
@@ -1121,6 +1474,7 @@
throws IOException, UnresolvedLinkException {
readLock();
try {
+ checkOperation(OperationCategory.READ);
if (isPermissionEnabled) {
checkTraverse(filename);
}
@@ -1163,6 +1517,8 @@
FileNotFoundException, ParentNotDirectoryException, IOException {
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
startFileInternal(src, permissions, holder, clientMachine, flag,
createParent, replication, blockSize);
} finally {
@@ -1266,30 +1622,8 @@
blockManager.getDatanodeManager().getDatanodeByHost(clientMachine);
if (append && myFile != null) {
- //
- // Replace current node with a INodeUnderConstruction.
- // Recreate in-memory lease record.
- //
- INodeFile node = (INodeFile) myFile;
- INodeFileUnderConstruction cons = new INodeFileUnderConstruction(
- node.getLocalNameBytes(),
- node.getReplication(),
- node.getModificationTime(),
- node.getPreferredBlockSize(),
- node.getBlocks(),
- node.getPermissionStatus(),
- holder,
- clientMachine,
- clientNode);
- dir.replaceNode(src, node, cons);
- leaseManager.addLease(cons.getClientName(), src);
-
- // convert last block to under-construction
- LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons);
-
- // add append file record to log, record lease, etc.
- getEditLog().logOpenFile(src, cons);
- return ret;
+ return prepareFileForWrite(
+ src, myFile, holder, clientMachine, clientNode, true);
} else {
// Now we can add the name to the filesystem. This file has no
// blocks associated with it.
@@ -1320,6 +1654,45 @@
}
return null;
}
+
+ /**
+ * Replace the current INodeFile with an INodeFileUnderConstruction.
+ * Recreate in-memory lease record.
+ *
+ * @param src path to the file
+ * @param file existing file object
+ * @param leaseHolder identifier of the lease holder on this file
+ * @param clientMachine identifier of the client machine
+ * @param clientNode if the client is collocated with a DN, that DN's descriptor
+ * @param writeToEditLog whether to persist this change to the edit log
+ * @return the locations of the last block if it is partial, or null otherwise
+ * @throws UnresolvedLinkException
+ * @throws IOException
+ */
+ public LocatedBlock prepareFileForWrite(String src, INode file,
+ String leaseHolder, String clientMachine, DatanodeDescriptor clientNode,
+ boolean writeToEditLog)
+ throws UnresolvedLinkException, IOException {
+ INodeFile node = (INodeFile) file;
+ INodeFileUnderConstruction cons = new INodeFileUnderConstruction(
+ node.getLocalNameBytes(),
+ node.getReplication(),
+ node.getModificationTime(),
+ node.getPreferredBlockSize(),
+ node.getBlocks(),
+ node.getPermissionStatus(),
+ leaseHolder,
+ clientMachine,
+ clientNode);
+ dir.replaceNode(src, node, cons);
+ leaseManager.addLease(cons.getClientName(), src);
+
+ LocatedBlock ret = blockManager.convertLastBlockToUnderConstruction(cons);
+ if (writeToEditLog) {
+ getEditLog().logOpenFile(src, cons);
+ }
+ return ret;
+ }
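The writeToEditLog flag presumably lets the same conversion be reused when the change originates from the edit log itself and must not be re-logged; the append path above passes true.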
/**
* Recover lease;
@@ -1336,6 +1709,8 @@
throws IOException {
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException(
"Cannot recover the lease of " + src, safeMode);
@@ -1455,6 +1830,8 @@
LocatedBlock lb = null;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
lb = startFileInternal(src, null, holder, clientMachine,
EnumSet.of(CreateFlag.APPEND),
false, blockManager.maxReplication, 0);
@@ -1519,6 +1896,8 @@
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Cannot add block to " + src, safeMode);
}
@@ -1552,6 +1931,7 @@
// Allocate a new block and record it in the INode.
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot add block to " + src, safeMode);
}
@@ -1570,10 +1950,14 @@
for (DatanodeDescriptor dn : targets) {
dn.incBlocksScheduled();
- }
+ }
+ dir.persistBlocks(src, pendingFile);
} finally {
writeUnlock();
}
+ if (persistBlocks) {
+ getEditLog().logSync();
+ }
// Create next block
LocatedBlock b = new LocatedBlock(getExtendedBlock(newBlock), targets, fileLength);
@@ -1594,6 +1978,7 @@
final List<DatanodeDescriptor> chosen;
readLock();
try {
+ checkOperation(OperationCategory.WRITE);
//check safe mode
if (isInSafeMode()) {
throw new SafeModeException("Cannot add datanode; src=" + src
@@ -1635,6 +2020,7 @@
UnresolvedLinkException, IOException {
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
//
// Remove the block from the pending creates list
//
@@ -1652,10 +2038,15 @@
NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
+ b + " is removed from pendingCreates");
}
- return true;
+ dir.persistBlocks(src, file);
} finally {
writeUnlock();
}
+ if (persistBlocks) {
+ getEditLog().logSync();
+ }
+
+ return true;
}
// make sure that we still have the lease on this file.
@@ -1705,6 +2096,8 @@
boolean success = false;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
success = completeFileInternal(src, holder,
ExtendedBlock.getLocalBlock(last));
} finally {
@@ -1764,12 +2157,15 @@
* @throws QuotaExceededException If addition of block exceeds space quota
*/
private Block allocateBlock(String src, INode[] inodes,
- DatanodeDescriptor targets[]) throws QuotaExceededException {
+ DatanodeDescriptor targets[]) throws QuotaExceededException,
+ SafeModeException {
assert hasWriteLock();
Block b = new Block(DFSUtil.getRandom().nextLong(), 0, 0);
while(isValidBlock(b)) {
b.setBlockId(DFSUtil.getRandom().nextLong());
}
+ // Increment the generation stamp for every new block.
+ nextGenerationStamp();
b.setGenerationStamp(getGenerationStamp());
b = dir.addBlock(src, inodes, b, targets);
NameNode.stateChangeLog.info("BLOCK* NameSystem.allocateBlock: "
@@ -1841,6 +2237,8 @@
}
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
status = renameToInternal(src, dst);
if (status && auditLog.isInfoEnabled() && isExternalInvocation()) {
resultingStat = dir.getFileInfo(dst, false);
@@ -1896,6 +2294,8 @@
}
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
renameToInternal(src, dst, options);
if (auditLog.isInfoEnabled() && isExternalInvocation()) {
resultingStat = dir.getFileInfo(dst, false);
@@ -1973,6 +2373,7 @@
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot delete " + src, safeMode);
}
@@ -2028,9 +2429,45 @@
if (blocks == null) {
return;
}
- for(Block b : blocks) {
+
+ // In the case that we are a Standby tailing edits from the
+ // active while in safe-mode, we need to track the total number
+ // of blocks and safe blocks in the system.
+ boolean trackBlockCounts = isSafeModeTrackingBlocks();
+ int numRemovedComplete = 0, numRemovedSafe = 0;
+
+ for (Block b : blocks) {
+ if (trackBlockCounts) {
+ BlockInfo bi = blockManager.getStoredBlock(b);
+ if (bi.isComplete()) {
+ numRemovedComplete++;
+ if (bi.numNodes() >= blockManager.minReplication) {
+ numRemovedSafe++;
+ }
+ }
+ }
blockManager.removeBlock(b);
}
+ if (trackBlockCounts) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Adjusting safe-mode totals for deletion of " + src + ":" +
+ "decreasing safeBlocks by " + numRemovedSafe +
+ ", totalBlocks by " + numRemovedComplete);
+ }
+ adjustSafeModeBlockTotals(-numRemovedSafe, -numRemovedComplete);
+ }
+ }
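For example (illustrative numbers): deleting a file whose three blocks are all complete, two of them with at least minReplication replicas, ends up calling adjustSafeModeBlockTotals(-2, -3), i.e. the safe count drops by 2 and the total by 3; an under-construction last block is not counted in either figure, since only complete blocks enter the totals.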
+
+ /**
+ * @see SafeModeInfo#shouldIncrementallyTrackBlocks
+ */
+ private boolean isSafeModeTrackingBlocks() {
+ if (!haEnabled) {
+ // Never track blocks incrementally in non-HA code.
+ return false;
+ }
+ SafeModeInfo sm = this.safeMode;
+ return sm != null && sm.shouldIncrementallyTrackBlocks();
}
/**
@@ -2045,11 +2482,15 @@
*
* @return object containing information regarding the file
* or null if file not found
+ * @throws StandbyException
*/
HdfsFileStatus getFileInfo(String src, boolean resolveLink)
- throws AccessControlException, UnresolvedLinkException {
+ throws AccessControlException, UnresolvedLinkException,
+ StandbyException {
readLock();
try {
+ checkOperation(OperationCategory.READ);
+
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException("Invalid file name: " + src);
}
@@ -2073,6 +2514,8 @@
}
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
status = mkdirsInternal(src, permissions, createParent);
} finally {
writeUnlock();
@@ -2127,9 +2570,11 @@
}
ContentSummary getContentSummary(String src) throws AccessControlException,
- FileNotFoundException, UnresolvedLinkException {
+ FileNotFoundException, UnresolvedLinkException, StandbyException {
readLock();
try {
+ checkOperation(OperationCategory.READ);
+
if (isPermissionEnabled) {
checkPermission(src, false, null, null, null, FsAction.READ_EXECUTE);
}
@@ -2148,6 +2593,7 @@
throws IOException, UnresolvedLinkException {
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot set quota on " + path, safeMode);
}
@@ -2172,6 +2618,7 @@
+ src + " for " + clientName);
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot fsync file " + src, safeMode);
}
@@ -2381,6 +2828,10 @@
String src = "";
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+ // If a DN tries to commit to the standby, the recovery will
+ // fail, and the next retry will succeed on the new NN.
+
if (isInSafeMode()) {
throw new SafeModeException(
"Cannot commitBlockSynchronization while in safe mode",
@@ -2455,8 +2906,8 @@
//remove lease, close file
finalizeINodeFileUnderConstruction(src, pendingFile);
} else if (supportAppends) {
- // If this commit does not want to close the file, persist
- // blocks only if append is supported
+ // If this commit does not want to close the file, persist blocks
+ // only if append is supported or we're explicitly told to
dir.persistBlocks(src, pendingFile);
}
} finally {
@@ -2481,6 +2932,8 @@
void renewLease(String holder) throws IOException {
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Cannot renew lease for " + holder, safeMode);
}
@@ -2508,6 +2961,8 @@
DirectoryListing dl;
readLock();
try {
+ checkOperation(OperationCategory.READ);
+
if (isPermissionEnabled) {
if (dir.isDir(src)) {
checkPathAccess(src, FsAction.READ_EXECUTE);
@@ -2586,7 +3041,7 @@
* @return an array of datanode commands
* @throws IOException
*/
- DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,
+ HeartbeatResponse handleHeartbeat(DatanodeRegistration nodeReg,
long capacity, long dfsUsed, long remaining, long blockPoolUsed,
int xceiverCount, int xmitsInProgress, int failedVolumes)
throws IOException {
@@ -2597,28 +3052,40 @@
DatanodeCommand[] cmds = blockManager.getDatanodeManager().handleHeartbeat(
nodeReg, blockPoolId, capacity, dfsUsed, remaining, blockPoolUsed,
xceiverCount, maxTransfer, failedVolumes);
- if (cmds != null) {
- return cmds;
+ if (cmds == null || cmds.length == 0) {
+ DatanodeCommand cmd = upgradeManager.getBroadcastCommand();
+ if (cmd != null) {
+ cmds = new DatanodeCommand[] {cmd};
+ }
}
-
- //check distributed upgrade
- DatanodeCommand cmd = upgradeManager.getBroadcastCommand();
- if (cmd != null) {
- return new DatanodeCommand[] {cmd};
- }
- return null;
+
+ return new HeartbeatResponse(cmds, createHaStatusHeartbeat());
} finally {
readUnlock();
}
}
+ private NNHAStatusHeartbeat createHaStatusHeartbeat() {
+ HAState state = haContext.getState();
+ NNHAStatusHeartbeat.State hbState;
+ if (state instanceof ActiveState) {
+ hbState = NNHAStatusHeartbeat.State.ACTIVE;
+ } else if (state instanceof StandbyState) {
+ hbState = NNHAStatusHeartbeat.State.STANDBY;
+ } else {
+ throw new AssertionError("Invalid state: " + state.getClass());
+ }
+ return new NNHAStatusHeartbeat(hbState,
+ getFSImage().getLastAppliedOrWrittenTxId());
+ }
+
/**
* Returns whether or not there were available resources at the last check of
* resources.
*
* @return true if there were sufficient resources available, false otherwise.
*/
- private boolean nameNodeHasResourcesAvailable() {
+ boolean nameNodeHasResourcesAvailable() {
return hasResourcesAvailable;
}
@@ -2626,7 +3093,7 @@
* Perform resource checks and cache the results.
* @throws IOException
*/
- private void checkAvailableResources() throws IOException {
+ void checkAvailableResources() {
Preconditions.checkState(nnResourceChecker != null,
"nnResourceChecker not initialized");
hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace();
@@ -2665,11 +3132,11 @@
}
}
- FSImage getFSImage() {
+ public FSImage getFSImage() {
return dir.fsImage;
}
- FSEditLog getEditLog() {
+ public FSEditLog getEditLog() {
return getFSImage().getEditLog();
}
@@ -2701,8 +3168,12 @@
@Metric({"TransactionsSinceLastLogRoll",
"Number of transactions since last edit log roll"})
public long getTransactionsSinceLastLogRoll() {
- return (getEditLog().getLastWrittenTxId() -
- getEditLog().getCurSegmentTxId()) + 1;
+ if (isInStandbyState()) {
+ return 0;
+ } else {
+ return getEditLog().getLastWrittenTxId() -
+ getEditLog().getCurSegmentTxId() + 1;
+ }
}
@Metric({"LastWrittenTransactionId", "Transaction ID written to the edit log"})
@@ -2931,6 +3402,8 @@
boolean initializedReplQueues = false;
/** Was safemode entered automatically because available resources were low. */
private boolean resourcesLow = false;
+ /** Should safemode adjust its block totals as blocks come in */
+ private boolean shouldIncrementallyTrackBlocks = false;
/**
* Creates SafeModeInfo when the name node enters
@@ -2959,6 +3432,18 @@
}
/**
+ * In the HA case, the StandbyNode can be in safemode while the namespace
+ * is modified by the edit log tailer. In this case, the number of total
+ * blocks changes as edits are processed (e.g. blocks are added and deleted).
+ * However, we don't want to do the incremental tracking during the
+ * startup-time loading process -- only once the initial total has been
+ * set after the image has been loaded.
+ */
+ private boolean shouldIncrementallyTrackBlocks() {
+ return shouldIncrementallyTrackBlocks;
+ }
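The comment above captures the invariant: block totals are adjusted incrementally only after an initial total has been set from the loaded image. A minimal standalone sketch of that pattern (hypothetical class, plain Java, not part of this patch):

    // Hypothetical illustration of "track deltas only once an initial total is
    // known", mirroring the shouldIncrementallyTrackBlocks flag above.
    class IncrementalTotal {
        private long total = -1;             // -1: initial total not yet established
        private boolean trackIncrements = false;

        void setInitialTotal(long initial) {
            this.total = initial;
            this.trackIncrements = true;     // from now on, edits adjust the total
        }

        void adjust(long delta) {
            if (!trackIncrements) {
                return;                      // still loading the image; ignore deltas
            }
            total += delta;
        }

        long get() {
            return total;
        }
    }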
+
+ /**
* Creates SafeModeInfo when safe mode is entered manually, or because
* available resources are low.
*
@@ -2986,13 +3471,7 @@
* @return true if in safe mode
*/
private synchronized boolean isOn() {
- try {
- assert isConsistent() : " SafeMode: Inconsistent filesystem state: "
- + "Total num of blocks, active blocks, or "
- + "total safe blocks don't match.";
- } catch(IOException e) {
- System.err.print(StringUtils.stringifyException(e));
- }
+ doConsistencyCheck();
return this.reached >= 0;
}
@@ -3031,8 +3510,9 @@
return;
}
}
- // if not done yet, initialize replication queues
- if (!isPopulatingReplQueues()) {
+ // if not done yet, initialize replication queues.
+ // In the standby, do not populate repl queues
+ if (!isPopulatingReplQueues() && !isInStandbyState()) {
initializeReplQueues();
}
long timeInSafemode = now() - systemStart;
@@ -3051,6 +3531,8 @@
+ nt.getNumOfLeaves() + " datanodes");
NameNode.stateChangeLog.info("STATE* UnderReplicatedBlocks has "
+ blockManager.numOfUnderReplicatedBlocks() + " blocks");
+
+ startSecretManagerIfNecessary();
}
/**
@@ -3073,7 +3555,7 @@
* initializing replication queues.
*/
private synchronized boolean canInitializeReplQueues() {
- return blockSafe >= blockReplQueueThreshold;
+ return !isInStandbyState() && blockSafe >= blockReplQueueThreshold;
}
/**
@@ -3106,6 +3588,9 @@
* Check and trigger safe mode if needed.
*/
private void checkMode() {
+ // Have to have write-lock since leaving safemode initializes
+ // repl queues, which requires write lock
+ assert hasWriteLock();
if (needEnter()) {
enter();
// check if we are ready to initialize replication queues
@@ -3145,6 +3630,13 @@
this.blockThreshold = (int) (blockTotal * threshold);
this.blockReplQueueThreshold =
(int) (blockTotal * replQueueThreshold);
+ if (haEnabled) {
+ // After we initialize the block count, any further namespace
+ // modifications done while in safe mode need to keep track
+ // of the number of total blocks in the system.
+ this.shouldIncrementallyTrackBlocks = true;
+ }
+
checkMode();
}
@@ -3154,9 +3646,10 @@
* @param replication current replication
*/
private synchronized void incrementSafeBlockCount(short replication) {
- if (replication == safeReplication)
+ if (replication == safeReplication) {
this.blockSafe++;
- checkMode();
+ checkMode();
+ }
}
/**
@@ -3165,9 +3658,11 @@
* @param replication current replication
*/
private synchronized void decrementSafeBlockCount(short replication) {
- if (replication == safeReplication-1)
+ if (replication == safeReplication-1) {
this.blockSafe--;
- checkMode();
+ assert blockSafe >= 0 || isManual();
+ checkMode();
+ }
}
/**
@@ -3285,16 +3780,45 @@
/**
* Checks consistency of the class state.
- * This is costly and currently called only in assert.
- * @throws IOException
+ * This is costly, so it only runs if asserts are enabled.
*/
- private boolean isConsistent() throws IOException {
+ private void doConsistencyCheck() {
+ boolean assertsOn = false;
+ assert assertsOn = true; // set to true if asserts are on
+ if (!assertsOn) return;
+
if (blockTotal == -1 && blockSafe == -1) {
- return true; // manual safe mode
+ return; // manual safe mode
}
int activeBlocks = blockManager.getActiveBlockCount();
- return (blockTotal == activeBlocks) ||
- (blockSafe >= 0 && blockSafe <= blockTotal);
+ if ((blockTotal != activeBlocks) &&
+ !(blockSafe >= 0 && blockSafe <= blockTotal)) {
+ throw new AssertionError(
+ " SafeMode: Inconsistent filesystem state: "
+ + "SafeMode data: blockTotal=" + blockTotal
+ + " blockSafe=" + blockSafe + "; "
+ + "BlockManager data: active=" + activeBlocks);
+ }
+ }
+
+ private synchronized void adjustBlockTotals(int deltaSafe, int deltaTotal) {
+ if (!shouldIncrementallyTrackBlocks) {
+ return;
+ }
+ assert haEnabled;
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Adjusting block totals from " +
+ blockSafe + "/" + blockTotal + " to " +
+ (blockSafe + deltaSafe) + "/" + (blockTotal + deltaTotal));
+ }
+ assert blockSafe + deltaSafe >= 0 : "Can't reduce blockSafe " +
+ blockSafe + " by " + deltaSafe + ": would be negative";
+ assert blockTotal + deltaTotal >= 0 : "Can't reduce blockTotal " +
+ blockTotal + " by " + deltaTotal + ": would be negative";
+
+ blockSafe += deltaSafe;
+ setBlockTotal(blockTotal + deltaTotal);
}
}
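doConsistencyCheck above uses the standard Java trick for detecting whether assertions are enabled, so the costly check is skipped entirely in normal runs. A self-contained sketch of that idiom (hypothetical class name):

    public class AssertProbe {
        static boolean assertionsEnabled() {
            boolean enabled = false;
            // The assignment only executes when the JVM runs with -ea; with
            // assertions disabled the assert expression is never evaluated.
            assert enabled = true;
            return enabled;
        }

        public static void main(String[] args) {
            System.out.println("assertions enabled: " + assertionsEnabled());
        }
    }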
@@ -3376,6 +3900,9 @@
@Override
public boolean isPopulatingReplQueues() {
+ if (isInStandbyState()) {
+ return false;
+ }
// safeMode is volatile, and may be set to null at any time
SafeModeInfo safeMode = this.safeMode;
if (safeMode == null)
@@ -3398,13 +3925,30 @@
SafeModeInfo safeMode = this.safeMode;
if (safeMode == null) // mostly true
return;
- safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
+ BlockInfo storedBlock = blockManager.getStoredBlock(b);
+ if (storedBlock.isComplete()) {
+ safeMode.decrementSafeBlockCount((short)blockManager.countNodes(b).liveReplicas());
+ }
+ }
+
+ /**
+ * Adjust the total number of blocks safe and expected during safe mode.
+ * If safe mode is not currently on, this is a no-op.
+ * @param deltaSafe the change in number of safe blocks
+ * @param deltaTotal the change in number of total blocks expected
+ */
+ public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal) {
+ // safeMode is volatile, and may be set to null at any time
+ SafeModeInfo safeMode = this.safeMode;
+ if (safeMode == null)
+ return;
+ safeMode.adjustBlockTotals(deltaSafe, deltaTotal);
}
/**
* Set the total number of blocks in the system.
*/
- void setBlockTotal() {
+ public void setBlockTotal() {
// safeMode is volatile, and may be set to null at any time
SafeModeInfo safeMode = this.safeMode;
if (safeMode == null)
@@ -3440,7 +3984,8 @@
}
assert node != null : "Found a lease for nonexisting file.";
assert node.isUnderConstruction() :
- "Found a lease for file that is not under construction.";
+ "Found a lease for file " + path + " that is not under construction." +
+ " lease=" + lease;
INodeFileUnderConstruction cons = (INodeFileUnderConstruction) node;
BlockInfo[] blocks = cons.getBlocks();
if(blocks == null)
@@ -3465,21 +4010,32 @@
void enterSafeMode(boolean resourcesLow) throws IOException {
writeLock();
try {
- // Ensure that any concurrent operations have been fully synced
- // before entering safe mode. This ensures that the FSImage
- // is entirely stable on disk as soon as we're in safe mode.
- getEditLog().logSyncAll();
- if (!isInSafeMode()) {
- safeMode = new SafeModeInfo(resourcesLow);
- return;
- }
- if (resourcesLow) {
- safeMode.setResourcesLow();
- }
- safeMode.setManual();
- getEditLog().logSyncAll();
- NameNode.stateChangeLog.info("STATE* Safe mode is ON. "
- + safeMode.getTurnOffTip());
+ // Stop the secret manager, since rolling the master key would
+ // try to write to the edit log
+ stopSecretManager();
+
+ // Ensure that any concurrent operations have been fully synced
+ // before entering safe mode. This ensures that the FSImage
+ // is entirely stable on disk as soon as we're in safe mode.
+ boolean isEditlogOpenForWrite = getEditLog().isOpenForWrite();
+ // Before the edit log is open for write, editLogStream will be null, so
+ // logSyncAll can only be called once the edit log is open for write.
+ if (isEditlogOpenForWrite) {
+ getEditLog().logSyncAll();
+ }
+ if (!isInSafeMode()) {
+ safeMode = new SafeModeInfo(resourcesLow);
+ return;
+ }
+ if (resourcesLow) {
+ safeMode.setResourcesLow();
+ }
+ safeMode.setManual();
+ if (isEditlogOpenForWrite) {
+ getEditLog().logSyncAll();
+ }
+ NameNode.stateChangeLog.info("STATE* Safe mode is ON. "
+ + safeMode.getTurnOffTip());
} finally {
writeUnlock();
}
@@ -3520,6 +4076,7 @@
CheckpointSignature rollEditLog() throws IOException {
writeLock();
try {
+ checkOperation(OperationCategory.JOURNAL);
if (isInSafeMode()) {
throw new SafeModeException("Log not rolled", safeMode);
}
@@ -3536,6 +4093,8 @@
throws IOException {
writeLock();
try {
+ checkOperation(OperationCategory.CHECKPOINT);
+
if (isInSafeMode()) {
throw new SafeModeException("Checkpoint not started", safeMode);
}
@@ -3552,6 +4111,8 @@
CheckpointSignature sig) throws IOException {
readLock();
try {
+ checkOperation(OperationCategory.CHECKPOINT);
+
if (isInSafeMode()) {
throw new SafeModeException("Checkpoint not ended", safeMode);
}
@@ -3704,6 +4265,34 @@
return blockManager.getExcessBlocksCount();
}
+ // HA-only metric
+ @Metric
+ public long getPostponedMisreplicatedBlocks() {
+ return blockManager.getPostponedMisreplicatedBlocksCount();
+ }
+
+ // HA-only metric
+ @Metric
+ public int getPendingDataNodeMessageCount() {
+ return blockManager.getPendingDataNodeMessageCount();
+ }
+
+ // HA-only metric
+ @Metric
+ public String getHAState() {
+ return haContext.getState().toString();
+ }
+
+ // HA-only metric
+ @Metric
+ public long getMillisSinceLastLoadedEdits() {
+ if (isInStandbyState() && editLogTailer != null) {
+ return now() - editLogTailer.getLastLoadTimestamp();
+ } else {
+ return 0;
+ }
+ }
+
@Metric
public int getBlockCapacity() {
return blockManager.getCapacity();
@@ -3715,6 +4304,7 @@
}
private ObjectName mbeanName;
+
/**
* Register the FSNamesystem MBean using the name
* "hadoop:service=NameNode,name=FSNamesystemState"
@@ -3813,6 +4403,29 @@
}
/**
+ * Client is reporting some bad block locations.
+ */
+ void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
+ writeLock();
+ try {
+ checkOperation(OperationCategory.WRITE);
+
+ NameNode.stateChangeLog.info("*DIR* NameNode.reportBadBlocks");
+ for (int i = 0; i < blocks.length; i++) {
+ ExtendedBlock blk = blocks[i].getBlock();
+ DatanodeInfo[] nodes = blocks[i].getLocations();
+ for (int j = 0; j < nodes.length; j++) {
+ DatanodeInfo dn = nodes[j];
+ blockManager.findAndMarkBlockAsCorrupt(blk, dn,
+ "client machine reported it");
+ }
+ }
+ } finally {
+ writeUnlock();
+ }
+ }
+
+ /**
* Get a new generation stamp together with an access token for
* a block under construction
*
@@ -3829,6 +4442,8 @@
LocatedBlock locatedBlock;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
// check validity of parameters
checkUCBlock(block, clientName);
@@ -3858,6 +4473,8 @@
throws IOException {
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Pipeline not updated", safeMode);
}
@@ -3873,7 +4490,7 @@
} finally {
writeUnlock();
}
- if (supportAppends) {
+ if (supportAppends || persistBlocks) {
getEditLog().logSync();
}
LOG.info("updatePipeline(" + oldBlock + ") successfully to " + newBlock);
@@ -4067,6 +4684,8 @@
readLock();
try {
+ checkOperation(OperationCategory.READ);
+
if (!isPopulatingReplQueues()) {
throw new IOException("Cannot run listCorruptFileBlocks because " +
"replication queues have not been initialized.");
@@ -4159,6 +4778,8 @@
Token<DelegationTokenIdentifier> token;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Cannot issue delegation token", safeMode);
}
@@ -4203,6 +4824,8 @@
long expiryTime;
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Cannot renew delegation token", safeMode);
}
@@ -4233,6 +4856,8 @@
throws IOException {
writeLock();
try {
+ checkOperation(OperationCategory.WRITE);
+
if (isInSafeMode()) {
throw new SafeModeException("Cannot cancel delegation token", safeMode);
}
@@ -4266,16 +4891,14 @@
* @param key new delegation key.
*/
public void logUpdateMasterKey(DelegationKey key) throws IOException {
- writeLock();
- try {
- if (isInSafeMode()) {
- throw new SafeModeException(
- "Cannot log master key update in safe mode", safeMode);
- }
- getEditLog().logUpdateMasterKey(key);
- } finally {
- writeUnlock();
- }
+
+ assert !isInSafeMode() :
+ "this should never be called while in safemode, since we stop " +
+ "the DT manager before entering safemode!";
+ // No need to hold FSN lock since we don't access any internal
+ // structures, and this is stopped before the FSN shuts itself
+ // down, etc.
+ getEditLog().logUpdateMasterKey(key);
getEditLog().logSync();
}
@@ -4545,9 +5168,32 @@
byte[] password) throws InvalidToken {
getDelegationTokenSecretManager().verifyToken(identifier, password);
}
+
+ public boolean isGenStampInFuture(long genStamp) {
+ return (genStamp > getGenerationStamp());
+ }
+ @VisibleForTesting
+ public EditLogTailer getEditLogTailer() {
+ return editLogTailer;
+ }
+
+ @VisibleForTesting
+ void setFsLockForTests(ReentrantReadWriteLock lock) {
+ this.fsLock = lock;
+ }
+
+ @VisibleForTesting
+ ReentrantReadWriteLock getFsLockForTests() {
+ return fsLock;
+ }
@VisibleForTesting
public SafeModeInfo getSafeModeInfoForTests() {
return safeMode;
}
+
+ @VisibleForTesting
+ public void setNNResourceChecker(NameNodeResourceChecker nnResourceChecker) {
+ this.nnResourceChecker = nnResourceChecker;
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java
index eeb40c2..603dd00 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java
@@ -52,6 +52,7 @@
private static final Log LOG = LogFactory.getLog(FileJournalManager.class);
private final StorageDirectory sd;
+ private final NNStorage storage;
private int outputBufferCapacity = 512*1024;
private static final Pattern EDITS_REGEX = Pattern.compile(
@@ -60,14 +61,14 @@
NameNodeFile.EDITS_INPROGRESS.getName() + "_(\\d+)");
private File currentInProgress = null;
- private long maxSeenTransaction = 0L;
@VisibleForTesting
StoragePurger purger
= new NNStorageRetentionManager.DeletionStoragePurger();
- public FileJournalManager(StorageDirectory sd) {
+ public FileJournalManager(StorageDirectory sd, NNStorage storage) {
this.sd = sd;
+ this.storage = storage;
}
@Override
@@ -76,11 +77,16 @@
@Override
synchronized public EditLogOutputStream startLogSegment(long txid)
throws IOException {
- currentInProgress = NNStorage.getInProgressEditsFile(sd, txid);
- EditLogOutputStream stm = new EditLogFileOutputStream(currentInProgress,
- outputBufferCapacity);
- stm.create();
- return stm;
+ try {
+ currentInProgress = NNStorage.getInProgressEditsFile(sd, txid);
+ EditLogOutputStream stm = new EditLogFileOutputStream(currentInProgress,
+ outputBufferCapacity);
+ stm.create();
+ return stm;
+ } catch (IOException e) {
+ storage.reportErrorsOnDirectory(sd);
+ throw e;
+ }
}
@Override
@@ -90,13 +96,14 @@
File dstFile = NNStorage.getFinalizedEditsFile(
sd, firstTxId, lastTxId);
- LOG.debug("Finalizing edits file " + inprogressFile + " -> " + dstFile);
+ LOG.info("Finalizing edits file " + inprogressFile + " -> " + dstFile);
Preconditions.checkState(!dstFile.exists(),
"Can't finalize edits file " + inprogressFile + " since finalized file " +
"already exists");
if (!inprogressFile.renameTo(dstFile)) {
- throw new IOException("Unable to finalize edits file " + inprogressFile);
+ storage.reportErrorsOnDirectory(sd);
+ throw new IllegalStateException("Unable to finalize edits file " + inprogressFile);
}
if (inprogressFile.equals(currentInProgress)) {
currentInProgress = null;
@@ -116,6 +123,7 @@
@Override
public void purgeLogsOlderThan(long minTxIdToKeep)
throws IOException {
+ LOG.info("Purging logs older than " + minTxIdToKeep);
File[] files = FileUtil.listFiles(sd.getCurrentDir());
List<EditLogFile> editLogs =
FileJournalManager.matchEditLogs(files);
@@ -135,18 +143,18 @@
*/
List<RemoteEditLog> getRemoteEditLogs(long firstTxId) throws IOException {
File currentDir = sd.getCurrentDir();
- List<EditLogFile> allLogFiles = matchEditLogs(
- FileUtil.listFiles(currentDir));
+ List<EditLogFile> allLogFiles = matchEditLogs(currentDir);
List<RemoteEditLog> ret = Lists.newArrayListWithCapacity(
allLogFiles.size());
for (EditLogFile elf : allLogFiles) {
- if (elf.isCorrupt() || elf.isInProgress()) continue;
+ if (elf.hasCorruptHeader() || elf.isInProgress()) continue;
if (elf.getFirstTxId() >= firstTxId) {
ret.add(new RemoteEditLog(elf.firstTxId, elf.lastTxId));
} else if ((firstTxId > elf.getFirstTxId()) &&
(firstTxId <= elf.getLastTxId())) {
- throw new IOException("Asked for firstTxId " + firstTxId
+ // Note that this behavior is different from getLogFiles below.
+ throw new IllegalStateException("Asked for firstTxId " + firstTxId
+ " which is in the middle of file " + elf.file);
}
}
@@ -154,6 +162,20 @@
return ret;
}
+ /**
+ * Returns matching edit logs from the log directory. Simple helper function
+ * that lists the files in the logDir and calls matchEditLogs(File[])
+ *
+ * @param logDir
+ * directory to match edit logs in
+ * @return matched edit logs
+ * @throws IOException
+ * IOException thrown for invalid logDir
+ */
+ static List<EditLogFile> matchEditLogs(File logDir) throws IOException {
+ return matchEditLogs(FileUtil.listFiles(logDir));
+ }
+
static List<EditLogFile> matchEditLogs(File[] filesInStorage) {
List<EditLogFile> ret = Lists.newArrayList();
for (File f : filesInStorage) {
@@ -169,7 +191,7 @@
LOG.error("Edits file " + f + " has improperly formatted " +
"transaction ID");
// skip
- }
+ }
}
// Check for in-progress edits
@@ -190,27 +212,37 @@
}
@Override
- synchronized public EditLogInputStream getInputStream(long fromTxId)
- throws IOException {
+ synchronized public EditLogInputStream getInputStream(long fromTxId,
+ boolean inProgressOk) throws IOException {
for (EditLogFile elf : getLogFiles(fromTxId)) {
- if (elf.getFirstTxId() == fromTxId) {
+ if (elf.containsTxId(fromTxId)) {
+ if (!inProgressOk && elf.isInProgress()) {
+ continue;
+ }
if (elf.isInProgress()) {
elf.validateLog();
}
if (LOG.isTraceEnabled()) {
LOG.trace("Returning edit stream reading from " + elf);
}
- return new EditLogFileInputStream(elf.getFile(),
- elf.getFirstTxId(), elf.getLastTxId());
+ EditLogFileInputStream elfis = new EditLogFileInputStream(elf.getFile(),
+ elf.getFirstTxId(), elf.getLastTxId(), elf.isInProgress());
+ long transactionsToSkip = fromTxId - elf.getFirstTxId();
+ if (transactionsToSkip > 0) {
+ LOG.info(String.format("Log begins at txid %d, but requested start "
+ + "txid is %d. Skipping %d edits.", elf.getFirstTxId(), fromTxId,
+ transactionsToSkip));
+ elfis.skipTransactions(transactionsToSkip);
+ }
+ return elfis;
}
}
- throw new IOException("Cannot find editlog file with " + fromTxId
- + " as first first txid");
+ throw new IOException("Cannot find editlog file containing " + fromTxId);
}
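The lookup above no longer requires fromTxId to be the first transaction of a segment: any segment containing it is acceptable, and the stream skips forward to the requested transaction. A hedged sketch of the selection arithmetic (hypothetical helper, plain Java):

    // Hypothetical helpers mirroring the containsTxId / transactionsToSkip logic.
    final class SegmentMath {
        // Inclusive containment check on a [firstTxId, lastTxId] segment.
        static boolean contains(long firstTxId, long lastTxId, long txId) {
            return firstTxId <= txId && txId <= lastTxId;
        }

        // Leading transactions to skip so the stream starts at fromTxId.
        static long transactionsToSkip(long firstTxId, long fromTxId) {
            return fromTxId - firstTxId;      // 0 when fromTxId is the segment start
        }

        public static void main(String[] args) {
            // Segment [101, 150]; caller asks to read from txid 120.
            System.out.println(contains(101, 150, 120));       // true
            System.out.println(transactionsToSkip(101, 120));  // 19
        }
    }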
@Override
- public long getNumberOfTransactions(long fromTxId)
+ public long getNumberOfTransactions(long fromTxId, boolean inProgressOk)
throws IOException, CorruptionException {
long numTxns = 0L;
@@ -222,21 +254,25 @@
LOG.warn("Gap in transactions in " + sd.getRoot() + ". Gap is "
+ fromTxId + " - " + (elf.getFirstTxId() - 1));
break;
- } else if (fromTxId == elf.getFirstTxId()) {
+ } else if (elf.containsTxId(fromTxId)) {
+ if (!inProgressOk && elf.isInProgress()) {
+ break;
+ }
+
if (elf.isInProgress()) {
elf.validateLog();
}
- if (elf.isCorrupt()) {
+ if (elf.hasCorruptHeader()) {
break;
}
+ numTxns += elf.getLastTxId() + 1 - fromTxId;
fromTxId = elf.getLastTxId() + 1;
- numTxns += fromTxId - elf.getFirstTxId();
if (elf.isInProgress()) {
break;
}
- } // else skip
+ }
}
if (LOG.isDebugEnabled()) {
@@ -244,7 +280,8 @@
+ " txns from " + fromTxId);
}
- long max = findMaxTransaction();
+ long max = findMaxTransaction(inProgressOk);
+
// fromTxId should be greater than max, as it points to the next
// transaction we should expect to find. If it is less than or equal
// to max, it means that a transaction with txid == max has not been found
@@ -261,23 +298,44 @@
@Override
synchronized public void recoverUnfinalizedSegments() throws IOException {
File currentDir = sd.getCurrentDir();
- List<EditLogFile> allLogFiles = matchEditLogs(currentDir.listFiles());
-
- // make sure journal is aware of max seen transaction before moving corrupt
- // files aside
- findMaxTransaction();
+ LOG.info("Recovering unfinalized segments in " + currentDir);
+ List<EditLogFile> allLogFiles = matchEditLogs(currentDir);
for (EditLogFile elf : allLogFiles) {
if (elf.getFile().equals(currentInProgress)) {
continue;
}
if (elf.isInProgress()) {
- elf.validateLog();
-
- if (elf.isCorrupt()) {
- elf.moveAsideCorruptFile();
+ // If the file is zero-length, we likely just crashed after opening the
+ // file, but before writing anything to it. Safe to delete it.
+ if (elf.getFile().length() == 0) {
+ LOG.info("Deleting zero-length edit log file " + elf);
+ if (!elf.getFile().delete()) {
+ throw new IOException("Unable to delete file " + elf.getFile());
+ }
continue;
}
+
+ elf.validateLog();
+
+ if (elf.hasCorruptHeader()) {
+ elf.moveAsideCorruptFile();
+ throw new CorruptionException("In-progress edit log file is corrupt: "
+ + elf);
+ }
+
+ // If the file has a valid header (isn't corrupt) but contains no
+ // transactions, we likely just crashed after opening the file and
+ // writing the header, but before syncing any transactions. Safe to
+ // delete the file.
+ if (elf.getNumTransactions() == 0) {
+ LOG.info("Deleting edit log file with zero transactions " + elf);
+ if (!elf.getFile().delete()) {
+ throw new IOException("Unable to delete " + elf.getFile());
+ }
+ continue;
+ }
+
finalizeLogSegment(elf.getFirstTxId(), elf.getLastTxId());
}
}
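Recovery of an in-progress segment above now distinguishes three cases before finalizing. A hedged sketch of that decision order (hypothetical types, not the Hadoop API):

    // Zero-length file -> delete; corrupt header -> move aside and fail;
    // valid header but no synced transactions -> delete; otherwise finalize.
    enum RecoveryAction { DELETE, MOVE_ASIDE_AND_FAIL, FINALIZE }

    final class SegmentRecovery {
        static RecoveryAction decide(long fileLength, boolean corruptHeader,
                                     long numTransactions) {
            if (fileLength == 0) {
                return RecoveryAction.DELETE;          // crashed before writing anything
            }
            if (corruptHeader) {
                return RecoveryAction.MOVE_ASIDE_AND_FAIL;
            }
            if (numTransactions == 0) {
                return RecoveryAction.DELETE;          // header written, nothing synced
            }
            return RecoveryAction.FINALIZE;
        }
    }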
@@ -285,16 +343,12 @@
private List<EditLogFile> getLogFiles(long fromTxId) throws IOException {
File currentDir = sd.getCurrentDir();
- List<EditLogFile> allLogFiles = matchEditLogs(currentDir.listFiles());
+ List<EditLogFile> allLogFiles = matchEditLogs(currentDir);
List<EditLogFile> logFiles = Lists.newArrayList();
for (EditLogFile elf : allLogFiles) {
- if (fromTxId > elf.getFirstTxId()
- && fromTxId <= elf.getLastTxId()) {
- throw new IOException("Asked for fromTxId " + fromTxId
- + " which is in middle of file " + elf.file);
- }
- if (fromTxId <= elf.getFirstTxId()) {
+ if (fromTxId <= elf.getFirstTxId() ||
+ elf.containsTxId(fromTxId)) {
logFiles.add(elf);
}
}
@@ -306,21 +360,35 @@
/**
* Find the maximum transaction in the journal.
- * This gets stored in a member variable, as corrupt edit logs
- * will be moved aside, but we still need to remember their first
- * tranaction id in the case that it was the maximum transaction in
- * the journal.
*/
- private long findMaxTransaction()
+ private long findMaxTransaction(boolean inProgressOk)
throws IOException {
+ boolean considerSeenTxId = true;
+ long seenTxId = NNStorage.readTransactionIdFile(sd);
+ long maxSeenTransaction = 0;
for (EditLogFile elf : getLogFiles(0)) {
+ if (elf.isInProgress() && !inProgressOk) {
+ if (elf.getFirstTxId() != HdfsConstants.INVALID_TXID &&
+ elf.getFirstTxId() <= seenTxId) {
+ // don't look at the seen_txid file if in-progress logs are not to be
+ // examined, and the value in seen_txid falls within the in-progress
+ // segment.
+ considerSeenTxId = false;
+ }
+ continue;
+ }
+
if (elf.isInProgress()) {
maxSeenTransaction = Math.max(elf.getFirstTxId(), maxSeenTransaction);
elf.validateLog();
}
maxSeenTransaction = Math.max(elf.getLastTxId(), maxSeenTransaction);
}
- return maxSeenTransaction;
+ if (considerSeenTxId) {
+ return Math.max(maxSeenTransaction, seenTxId);
+ } else {
+ return maxSeenTransaction;
+ }
}
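The maximum-transaction computation now folds in the persisted seen_txid value unless that value falls inside an in-progress segment the caller asked to ignore. A minimal sketch of the final combination step (hypothetical helper):

    final class TxIdMath {
        static long maxTransaction(long maxScanned, long seenTxId,
                                   boolean considerSeenTxId) {
            return considerSeenTxId ? Math.max(maxScanned, seenTxId) : maxScanned;
        }
    }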
@Override
@@ -335,8 +403,9 @@
private File file;
private final long firstTxId;
private long lastTxId;
+ private long numTx = -1;
- private boolean isCorrupt = false;
+ private boolean hasCorruptHeader = false;
private final boolean isInProgress;
final static Comparator<EditLogFile> COMPARE_BY_START_TXID
@@ -376,6 +445,10 @@
long getLastTxId() {
return lastTxId;
}
+
+ boolean containsTxId(long txId) {
+ return firstTxId <= txId && txId <= lastTxId;
+ }
/**
* Count the number of valid transactions in a log.
@@ -384,11 +457,13 @@
*/
void validateLog() throws IOException {
EditLogValidation val = EditLogFileInputStream.validateEditLog(file);
- if (val.getNumTransactions() == 0) {
- markCorrupt();
- } else {
- this.lastTxId = val.getEndTxId();
- }
+ this.numTx = val.getNumTransactions();
+ this.lastTxId = val.getEndTxId();
+ this.hasCorruptHeader = val.hasCorruptHeader();
+ }
+
+ long getNumTransactions() {
+ return numTx;
}
boolean isInProgress() {
@@ -399,16 +474,12 @@
return file;
}
- void markCorrupt() {
- isCorrupt = true;
- }
-
- boolean isCorrupt() {
- return isCorrupt;
+ boolean hasCorruptHeader() {
+ return hasCorruptHeader;
}
void moveAsideCorruptFile() throws IOException {
- assert isCorrupt;
+ assert hasCorruptHeader;
File src = file;
File dst = new File(src.getParent(), src.getName() + ".corrupt");
@@ -423,8 +494,9 @@
@Override
public String toString() {
return String.format("EditLogFile(file=%s,first=%019d,last=%019d,"
- +"inProgress=%b,corrupt=%b)", file.toString(),
- firstTxId, lastTxId, isInProgress(), isCorrupt);
+ +"inProgress=%b,hasCorruptHeader=%b,numTx=%d)",
+ file.toString(), firstTxId, lastTxId,
+ isInProgress(), hasCorruptHeader, numTx);
}
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java
index 8753b27..b986003 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java
@@ -124,16 +124,18 @@
final long txid = parsedParams.getTxId();
if (! currentlyDownloadingCheckpoints.add(txid)) {
- throw new IOException(
+ response.sendError(HttpServletResponse.SC_CONFLICT,
"Another checkpointer is already in the process of uploading a" +
" checkpoint made at transaction ID " + txid);
+ return null;
}
try {
if (nnImage.getStorage().findImageFile(txid) != null) {
- throw new IOException(
+ response.sendError(HttpServletResponse.SC_CONFLICT,
"Another checkpointer already uploaded an checkpoint " +
"for txid " + txid);
+ return null;
}
// issue a HTTP get request to download the new fsimage
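The servlet change above reports checkpoint collisions as HTTP 409 Conflict instead of letting an IOException surface as a server error. A hedged sketch of that response pattern using the standard Servlet API (the servlet and parameter names are hypothetical):

    import java.io.IOException;
    import javax.servlet.http.HttpServlet;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;

    // Hypothetical servlet: reject duplicate work for the same txid with 409.
    public class ConflictAwareServlet extends HttpServlet {
        private final java.util.Set<Long> inFlight =
            java.util.Collections.synchronizedSet(new java.util.HashSet<Long>());

        @Override
        protected void doGet(HttpServletRequest req, HttpServletResponse resp)
                throws IOException {
            long txid = Long.parseLong(req.getParameter("txid"));
            if (!inFlight.add(txid)) {
                // Another request is already handling this txid.
                resp.sendError(HttpServletResponse.SC_CONFLICT,
                    "Work for txid " + txid + " is already in progress");
                return;
            }
            try {
                resp.getWriter().println("accepted txid " + txid);
            } finally {
                inFlight.remove(txid);
            }
        }
    }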
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileUnderConstruction.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileUnderConstruction.java
index 0fab53c..c5c47fd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileUnderConstruction.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeFileUnderConstruction.java
@@ -26,6 +26,8 @@
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
+import com.google.common.base.Joiner;
+
/**
* I-node for file being written.
*/
@@ -41,19 +43,7 @@
String clientName,
String clientMachine,
DatanodeDescriptor clientNode) {
- this(permissions, 0, replication, preferredBlockSize, modTime,
- clientName, clientMachine, clientNode);
- }
-
- INodeFileUnderConstruction(PermissionStatus permissions,
- int nrBlocks,
- short replication,
- long preferredBlockSize,
- long modTime,
- String clientName,
- String clientMachine,
- DatanodeDescriptor clientNode) {
- super(permissions.applyUMask(UMASK), nrBlocks, replication,
+ super(permissions.applyUMask(UMASK), 0, replication,
modTime, modTime, preferredBlockSize);
this.clientName = clientName;
this.clientMachine = clientMachine;
@@ -106,6 +96,9 @@
// use the modification time as the access time
//
INodeFile convertToInodeFile() {
+ assert allBlocksComplete() :
+ "Can't finalize inode " + this + " since it contains " +
+ "non-complete blocks! Blocks are: " + blocksAsString();
INodeFile obj = new INodeFile(getPermissionStatus(),
getBlocks(),
getReplication(),
@@ -115,6 +108,18 @@
return obj;
}
+
+ /**
+ * @return true if all of the blocks in this file are marked as completed.
+ */
+ private boolean allBlocksComplete() {
+ for (BlockInfo b : blocks) {
+ if (!b.isComplete()) {
+ return false;
+ }
+ }
+ return true;
+ }
/**
* Remove a block from the block list. This block should be
@@ -153,4 +158,8 @@
setBlock(numBlocks()-1, ucBlock);
return ucBlock;
}
+
+ private String blocksAsString() {
+ return Joiner.on(",").join(this.blocks);
+ }
}
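The new assertion builds its failure message with Guava's Joiner; in Java the expression after the ':' in an assert is evaluated only when the assertion actually fails, so the join costs nothing on the happy path. A tiny self-contained example of that usage (hypothetical class name):

    import java.util.Arrays;
    import java.util.List;
    import com.google.common.base.Joiner;

    public class JoinerDemo {
        public static void main(String[] args) {
            List<String> blocks = Arrays.asList("blk_1", "blk_2", "blk_3");
            // The message expression runs only if the condition is false.
            assert !blocks.isEmpty() : "no blocks: " + Joiner.on(",").join(blocks);
            System.out.println(Joiner.on(",").join(blocks));  // blk_1,blk_2,blk_3
        }
    }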
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java
index d45de18..f9c622d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java
@@ -48,20 +48,23 @@
/**
* Get the input stream starting with fromTxnId from this journal manager
* @param fromTxnId the first transaction id we want to read
+ * @param inProgressOk whether or not in-progress streams should be returned
* @return the stream starting with transaction fromTxnId
* @throws IOException if a stream cannot be found.
*/
- EditLogInputStream getInputStream(long fromTxnId) throws IOException;
+ EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk)
+ throws IOException;
/**
* Get the number of transaction contiguously available from fromTxnId.
*
* @param fromTxnId Transaction id to count from
+ * @param inProgressOk whether or not in-progress streams should be counted
* @return The number of transactions available from fromTxnId
* @throws IOException if the journal cannot be read.
* @throws CorruptionException if there is a gap in the journal at fromTxnId.
*/
- long getNumberOfTransactions(long fromTxnId)
+ long getNumberOfTransactions(long fromTxnId, boolean inProgressOk)
throws IOException, CorruptionException;
/**
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java
index b1accd8..d84d79d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java
@@ -25,8 +25,10 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
+
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
@@ -35,8 +37,6 @@
import com.google.common.collect.Multimaps;
import com.google.common.collect.Sets;
-import org.apache.hadoop.classification.InterfaceAudience;
-
/**
* Manages a collection of Journals. None of the methods are synchronized, it is
* assumed that FSEditLog methods, that use this class, use proper
@@ -148,11 +148,17 @@
private List<JournalAndStream> journals = Lists.newArrayList();
final int minimumRedundantJournals;
+ private volatile Runtime runtime = Runtime.getRuntime();
JournalSet(int minimumRedundantResources) {
this.minimumRedundantJournals = minimumRedundantResources;
}
+ @VisibleForTesting
+ public void setRuntimeForTesting(Runtime runtime) {
+ this.runtime = runtime;
+ }
+
@Override
public EditLogOutputStream startLogSegment(final long txId) throws IOException {
mapJournalsAndReportErrors(new JournalClosure() {
@@ -201,19 +207,25 @@
* or null if no more exist
*/
@Override
- public EditLogInputStream getInputStream(long fromTxnId) throws IOException {
+ public EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk)
+ throws IOException {
JournalManager bestjm = null;
long bestjmNumTxns = 0;
CorruptionException corruption = null;
for (JournalAndStream jas : journals) {
+ if (jas.isDisabled()) continue;
+
JournalManager candidate = jas.getManager();
long candidateNumTxns = 0;
try {
- candidateNumTxns = candidate.getNumberOfTransactions(fromTxnId);
+ candidateNumTxns = candidate.getNumberOfTransactions(fromTxnId,
+ inProgressOk);
} catch (CorruptionException ce) {
corruption = ce;
} catch (IOException ioe) {
+ LOG.warn("Unable to read input streams from JournalManager " + candidate,
+ ioe);
continue; // error reading disk, just skip
}
@@ -231,15 +243,20 @@
return null;
}
}
- return bestjm.getInputStream(fromTxnId);
+ return bestjm.getInputStream(fromTxnId, inProgressOk);
}
@Override
- public long getNumberOfTransactions(long fromTxnId) throws IOException {
+ public long getNumberOfTransactions(long fromTxnId, boolean inProgressOk)
+ throws IOException {
long num = 0;
for (JournalAndStream jas: journals) {
- if (jas.isActive()) {
- long newNum = jas.getManager().getNumberOfTransactions(fromTxnId);
+ if (jas.isDisabled()) {
+ LOG.info("Skipping jas " + jas + " since it's disabled");
+ continue;
+ } else {
+ long newNum = jas.getManager().getNumberOfTransactions(fromTxnId,
+ inProgressOk);
if (newNum > num) {
num = newNum;
}
@@ -298,13 +315,31 @@
*/
private void mapJournalsAndReportErrors(
JournalClosure closure, String status) throws IOException{
+
List<JournalAndStream> badJAS = Lists.newLinkedList();
for (JournalAndStream jas : journals) {
try {
closure.apply(jas);
} catch (Throwable t) {
- LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
- badJAS.add(jas);
+ if (jas.isRequired()) {
+ String msg = "Error: " + status + " failed for required journal ("
+ + jas + ")";
+ LOG.fatal(msg, t);
+ // If we fail on *any* of the required journals, then we must not
+ // continue on any of the other journals. Abort them to ensure that
+ // retry behavior doesn't allow them to keep going in any way.
+ abortAllJournals();
+ // the current policy is to shutdown the NN on errors to shared edits
+ // dir. There are many code paths to shared edits failures - syncs,
+ // roll of edits etc. All of them go through this common function
+ // where the isRequired() check is made. Applying exit policy here
+ // to catch all code paths.
+ runtime.exit(1);
+ throw new IOException(msg);
+ } else {
+ LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
+ badJAS.add(jas);
+ }
}
}
disableAndReportErrorOnJournals(badJAS);
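The error handling above now splits journals into required and optional: a failure on a required journal aborts every stream and terminates the NameNode (via runtime.exit), while optional failures merely disable that journal. A hedged, generic sketch of the policy (hypothetical types; it throws instead of exiting the JVM):

    import java.util.ArrayList;
    import java.util.List;

    final class JournalPolicy {
        interface Journal {
            boolean isRequired();
            void apply() throws Exception;
            void abort();
        }

        static void applyToAll(List<Journal> journals) {
            List<Journal> failedOptional = new ArrayList<Journal>();
            for (Journal j : journals) {
                try {
                    j.apply();
                } catch (Throwable t) {
                    if (j.isRequired()) {
                        // A required journal failed: abort everything and stop,
                        // rather than letting retries limp along inconsistently.
                        for (Journal other : journals) {
                            other.abort();
                        }
                        throw new IllegalStateException("required journal failed", t);
                    }
                    failedOptional.add(j);   // optional journals are just disabled
                }
            }
            // ... mark failedOptional as disabled and report errors ...
        }
    }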
@@ -317,6 +352,17 @@
}
/**
+ * Abort all of the underlying streams.
+ */
+ private void abortAllJournals() {
+ for (JournalAndStream jas : journals) {
+ if (jas.isActive()) {
+ jas.abort();
+ }
+ }
+ }
+
+ /**
* An implementation of EditLogOutputStream that applies a requested method on
* all the journals that are currently active.
*/
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java
index 6f03452..71e6cbb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java
@@ -34,6 +34,10 @@
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
+import org.apache.hadoop.util.Daemon;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
import static org.apache.hadoop.hdfs.server.common.Util.now;
@@ -82,6 +86,9 @@
//
private SortedMap<String, Lease> sortedLeasesByPath = new TreeMap<String, Lease>();
+ private Daemon lmthread;
+ private volatile boolean shouldRunMonitor;
+
LeaseManager(FSNamesystem fsnamesystem) {this.fsnamesystem = fsnamesystem;}
Lease getLease(String holder) {
@@ -146,6 +153,9 @@
Lease lease = getLease(holder);
if (lease != null) {
removeLease(lease, src);
+ } else {
+ LOG.warn("Removing non-existent lease! holder=" + holder +
+ " src=" + src);
}
}
@@ -190,6 +200,15 @@
}
}
+ /**
+ * Renew all of the currently open leases.
+ */
+ synchronized void renewAllLeases() {
+ for (Lease l : leases.values()) {
+ renewLease(l);
+ }
+ }
+
/************************************************************
* A Lease governs all the locks held by a single client.
* For each client there's a corresponding lease, whose
@@ -296,6 +315,11 @@
paths.remove(oldpath);
paths.add(newpath);
}
+
+ @VisibleForTesting
+ long getLastUpdate() {
+ return lastUpdate;
+ }
}
synchronized void changeLease(String src, String dst,
@@ -367,18 +391,18 @@
/** Check leases periodically. */
public void run() {
- for(; fsnamesystem.isRunning(); ) {
- fsnamesystem.writeLock();
+ for(; shouldRunMonitor && fsnamesystem.isRunning(); ) {
try {
- if (!fsnamesystem.isInSafeMode()) {
- checkLeases();
+ fsnamesystem.writeLockInterruptibly();
+ try {
+ if (!fsnamesystem.isInSafeMode()) {
+ checkLeases();
+ }
+ } finally {
+ fsnamesystem.writeUnlock();
}
- } finally {
- fsnamesystem.writeUnlock();
- }
-
-
- try {
+
+
Thread.sleep(HdfsServerConstants.NAMENODE_LEASE_RECHECK_INTERVAL);
} catch(InterruptedException ie) {
if (LOG.isDebugEnabled()) {
@@ -437,4 +461,36 @@
+ "\n sortedLeasesByPath=" + sortedLeasesByPath
+ "\n}";
}
+
+ void startMonitor() {
+ Preconditions.checkState(lmthread == null,
+ "Lease Monitor already running");
+ shouldRunMonitor = true;
+ lmthread = new Daemon(new Monitor());
+ lmthread.start();
+ }
+
+ void stopMonitor() {
+ if (lmthread != null) {
+ shouldRunMonitor = false;
+ try {
+ lmthread.interrupt();
+ lmthread.join(3000);
+ } catch (InterruptedException ie) {
+ LOG.warn("Encountered exception ", ie);
+ }
+ lmthread = null;
+ }
+ }
+
+ /**
+ * Trigger the currently-running Lease monitor to re-check
+ * its leases immediately. This is for use by unit tests.
+ */
+ @VisibleForTesting
+ void triggerMonitorCheckNow() {
+ Preconditions.checkState(lmthread != null,
+ "Lease monitor is not running");
+ lmthread.interrupt();
+ }
}
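startMonitor/stopMonitor above follow a standard stoppable-daemon pattern: a volatile run flag, plus interrupt-and-join on shutdown (the same interrupt is also reused to force an immediate re-check). A self-contained sketch of the pattern in plain Java (hypothetical class):

    public class StoppableMonitor {
        private volatile boolean shouldRun;
        private Thread thread;

        public synchronized void start() {
            if (thread != null) {
                throw new IllegalStateException("monitor already running");
            }
            shouldRun = true;
            thread = new Thread(new Runnable() {
                public void run() {
                    while (shouldRun) {
                        try {
                            // ... periodic check would go here ...
                            Thread.sleep(2000);
                        } catch (InterruptedException ie) {
                            // Interrupt is used both to stop and to re-check early.
                        }
                    }
                }
            }, "monitor");
            thread.setDaemon(true);
            thread.start();
        }

        public synchronized void stop() throws InterruptedException {
            if (thread == null) {
                return;
            }
            shouldRun = false;
            thread.interrupt();   // wake it up if it is sleeping
            thread.join(3000);
            thread = null;
        }
    }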
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
index 7bca8f4..3f157e0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
@@ -28,6 +28,7 @@
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@@ -161,7 +162,8 @@
// this may modify the editsDirs, so copy before passing in
setStorageDirectories(imageDirs,
- Lists.newArrayList(editsDirs));
+ Lists.newArrayList(editsDirs),
+ FSNamesystem.getSharedEditsDirs(conf));
}
@Override // Storage
@@ -249,6 +251,16 @@
List<StorageDirectory> getRemovedStorageDirs() {
return this.removedStorageDirs;
}
+
+ /**
+ * See {@link NNStorage#setStorageDirectories(Collection, Collection, Collection)}
+ */
+ @VisibleForTesting
+ synchronized void setStorageDirectories(Collection<URI> fsNameDirs,
+ Collection<URI> fsEditsDirs)
+ throws IOException {
+ setStorageDirectories(fsNameDirs, fsEditsDirs, new ArrayList<URI>());
+ }
/**
* Set the storage directories which will be used. This should only ever be
@@ -265,7 +277,8 @@
*/
@VisibleForTesting
synchronized void setStorageDirectories(Collection<URI> fsNameDirs,
- Collection<URI> fsEditsDirs)
+ Collection<URI> fsEditsDirs,
+ Collection<URI> sharedEditsDirs)
throws IOException {
this.storageDirs.clear();
this.removedStorageDirs.clear();
@@ -289,7 +302,8 @@
if(dirName.getScheme().compareTo(JournalType.FILE.name().toLowerCase())
== 0){
this.addStorageDir(new StorageDirectory(new File(dirName.getPath()),
- dirType));
+ dirType,
+ !sharedEditsDirs.contains(dirName))); // Don't lock the dir if it's shared.
}
}
@@ -301,7 +315,7 @@
if(dirName.getScheme().compareTo(JournalType.FILE.name().toLowerCase())
== 0)
this.addStorageDir(new StorageDirectory(new File(dirName.getPath()),
- NameNodeDirType.EDITS));
+ NameNodeDirType.EDITS, !sharedEditsDirs.contains(dirName)));
}
}
@@ -458,7 +472,7 @@
/**
* @return the transaction ID of the last checkpoint.
*/
- long getMostRecentCheckpointTxId() {
+ public long getMostRecentCheckpointTxId() {
return mostRecentCheckpointTxId;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java
index fe65100..fe75247 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java
@@ -31,6 +31,7 @@
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
import org.apache.hadoop.hdfs.util.MD5FileUtils;
+import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
@@ -46,6 +47,7 @@
public class NNStorageRetentionManager {
private final int numCheckpointsToRetain;
+ private final long numExtraEditsToRetain;
private static final Log LOG = LogFactory.getLog(
NNStorageRetentionManager.class);
private final NNStorage storage;
@@ -60,6 +62,15 @@
this.numCheckpointsToRetain = conf.getInt(
DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY,
DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT);
+ this.numExtraEditsToRetain = conf.getLong(
+ DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY,
+ DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_DEFAULT);
+ Preconditions.checkArgument(numCheckpointsToRetain > 0,
+ "Must retain at least one checkpoint");
+ Preconditions.checkArgument(numExtraEditsToRetain >= 0,
+ DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY +
+ " must not be negative");
+
this.storage = storage;
this.editLog = editLog;
this.purger = purger;
@@ -79,8 +90,12 @@
purgeCheckpointsOlderThan(inspector, minImageTxId);
// If fsimage_N is the image we want to keep, then we need to keep
// all txns > N. We can remove anything < N+1, since fsimage_N
- // reflects the state up to and including N.
- editLog.purgeLogsOlderThan(minImageTxId + 1);
+ // reflects the state up to and including N. However, we also
+ // provide a "cushion" of older txns that we keep, which is
+ // handy for HA, where a remote node may not have as many
+ // new images.
+ long purgeLogsFrom = Math.max(0, minImageTxId + 1 - numExtraEditsToRetain);
+ editLog.purgeLogsOlderThan(purgeLogsFrom);
}
private void purgeCheckpointsOlderThan(
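The retention change keeps a cushion of edits beyond the oldest retained image so a standby that is behind on checkpointing can still catch up from local logs. A worked example of the purge threshold (the config value below is an assumption, not the documented default):

    public class RetentionMath {
        public static void main(String[] args) {
            long minImageTxId = 1000000;            // oldest fsimage being kept
            long numExtraEditsToRetain = 250000;    // assumed config value
            long purgeLogsFrom = Math.max(0, minImageTxId + 1 - numExtraEditsToRetain);
            System.out.println(purgeLogsFrom);      // 750001: edits below this go away
        }
    }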
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
index a0f4d4b..b62f0d5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
@@ -17,9 +17,6 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
-import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
-
import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
@@ -33,22 +30,40 @@
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
+import org.apache.hadoop.ha.HealthCheckFailedException;
+import org.apache.hadoop.ha.ServiceFailedException;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Trash;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
+import org.apache.hadoop.hdfs.server.namenode.ha.ActiveState;
+import org.apache.hadoop.hdfs.server.namenode.ha.HAContext;
+import org.apache.hadoop.hdfs.server.namenode.ha.HAState;
+import org.apache.hadoop.hdfs.server.namenode.ha.StandbyState;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
+import org.apache.hadoop.ipc.StandbyException;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.security.RefreshUserMappingsProtocol;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
+import org.apache.hadoop.tools.GetUserMappingsProtocol;
import org.apache.hadoop.util.ServicePlugin;
import org.apache.hadoop.util.StringUtils;
@@ -96,6 +111,22 @@
}
/**
+ * Categories of operations supported by the namenode.
+ */
+ public static enum OperationCategory {
+ /** Operations that are state agnostic */
+ UNCHECKED,
+ /** Read operation that does not change the namespace state */
+ READ,
+ /** Write operation that changes the namespace state */
+ WRITE,
+ /** Operations related to checkpointing */
+ CHECKPOINT,
+ /** Operations related to {@link JournalProtocol} */
+ JOURNAL
+ }
+
+ /**
* HDFS federation configuration can have two types of parameters:
* <ol>
* <li>Parameter that is common for all the name services in the cluster.</li>
@@ -110,6 +141,7 @@
DFS_NAMENODE_RPC_ADDRESS_KEY,
DFS_NAMENODE_NAME_DIR_KEY,
DFS_NAMENODE_EDITS_DIR_KEY,
+ DFS_NAMENODE_SHARED_EDITS_DIR_KEY,
DFS_NAMENODE_CHECKPOINT_DIR_KEY,
DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY,
DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY,
@@ -124,14 +156,40 @@
DFS_NAMENODE_BACKUP_SERVICE_RPC_ADDRESS_KEY
};
-
+ public long getProtocolVersion(String protocol,
+ long clientVersion) throws IOException {
+ if (protocol.equals(ClientProtocol.class.getName())) {
+ return ClientProtocol.versionID;
+ } else if (protocol.equals(DatanodeProtocol.class.getName())){
+ return DatanodeProtocol.versionID;
+ } else if (protocol.equals(NamenodeProtocol.class.getName())){
+ return NamenodeProtocol.versionID;
+ } else if (protocol.equals(RefreshAuthorizationPolicyProtocol.class.getName())){
+ return RefreshAuthorizationPolicyProtocol.versionID;
+ } else if (protocol.equals(RefreshUserMappingsProtocol.class.getName())){
+ return RefreshUserMappingsProtocol.versionID;
+ } else if (protocol.equals(GetUserMappingsProtocol.class.getName())){
+ return GetUserMappingsProtocol.versionID;
+ } else {
+ throw new IOException("Unknown protocol to name node: " + protocol);
+ }
+ }
+
public static final int DEFAULT_PORT = 8020;
-
public static final Log LOG = LogFactory.getLog(NameNode.class.getName());
public static final Log stateChangeLog = LogFactory.getLog("org.apache.hadoop.hdfs.StateChange");
+ public static final HAState ACTIVE_STATE = new ActiveState();
+ public static final HAState STANDBY_STATE = new StandbyState();
protected FSNamesystem namesystem;
+ protected final Configuration conf;
protected NamenodeRole role;
+ private HAState state;
+ private final boolean haEnabled;
+ private final HAContext haContext;
+ protected boolean allowStaleStandbyReads;
+
+
/** httpServer */
protected NameNodeHttpServer httpServer;
private Thread emptier;
@@ -212,7 +270,7 @@
* @param filesystemURI
* @return address of file system
*/
- static InetSocketAddress getAddress(URI filesystemURI) {
+ public static InetSocketAddress getAddress(URI filesystemURI) {
String authority = filesystemURI.getAuthority();
if (authority == null) {
throw new IllegalArgumentException(String.format(
@@ -251,13 +309,11 @@
* Given a configuration get the address of the service rpc server
* If the service rpc is not configured returns null
*/
- protected InetSocketAddress getServiceRpcServerAddress(Configuration conf)
- throws IOException {
+ protected InetSocketAddress getServiceRpcServerAddress(Configuration conf) {
return NameNode.getServiceAddress(conf, false);
}
- protected InetSocketAddress getRpcServerAddress(Configuration conf)
- throws IOException {
+ protected InetSocketAddress getRpcServerAddress(Configuration conf) {
return getAddress(conf);
}
@@ -335,11 +391,7 @@
throw e;
}
- activate(conf);
- LOG.info(getRole() + " up at: " + rpcServer.getRpcAddress());
- if (rpcServer.getServiceRpcAddress() != null) {
- LOG.info(getRole() + " service server is up at: " + rpcServer.getServiceRpcAddress());
- }
+ startCommonServices(conf);
}
/**
@@ -373,19 +425,11 @@
}
}
- /**
- * Activate name-node servers and threads.
- */
- void activate(Configuration conf) throws IOException {
- if ((isRole(NamenodeRole.NAMENODE))
- && (UserGroupInformation.isSecurityEnabled())) {
- namesystem.activateSecretManager();
- }
- namesystem.activate(conf);
+ /** Start the services common to active and standby states */
+ private void startCommonServices(Configuration conf) throws IOException {
+ namesystem.startCommonServices(conf, haContext);
startHttpServer(conf);
rpcServer.start();
- startTrashEmptier(conf);
-
plugins = conf.getInstances(DFS_NAMENODE_PLUGINS_KEY,
ServicePlugin.class);
for (ServicePlugin p: plugins) {
@@ -395,8 +439,28 @@
LOG.warn("ServicePlugin " + p + " could not be started", t);
}
}
+ LOG.info(getRole() + " up at: " + rpcServer.getRpcAddress());
+ if (rpcServer.getServiceRpcAddress() != null) {
+ LOG.info(getRole() + " service server is up at: "
+ + rpcServer.getServiceRpcAddress());
+ }
}
-
+
+ private void stopCommonServices() {
+ if(namesystem != null) namesystem.close();
+ if(rpcServer != null) rpcServer.stop();
+ if (plugins != null) {
+ for (ServicePlugin p : plugins) {
+ try {
+ p.stop();
+ } catch (Throwable t) {
+ LOG.warn("ServicePlugin " + p + " could not be stopped", t);
+ }
+ }
+ }
+ stopHttpServer();
+ }
+
private void startTrashEmptier(Configuration conf) throws IOException {
long trashInterval
= conf.getLong(CommonConfigurationKeys.FS_TRASH_INTERVAL_KEY,
@@ -408,11 +472,26 @@
this.emptier.start();
}
+ private void stopTrashEmptier() {
+ if (this.emptier != null) {
+ emptier.interrupt();
+ emptier = null;
+ }
+ }
+
private void startHttpServer(final Configuration conf) throws IOException {
httpServer = new NameNodeHttpServer(conf, this, getHttpServerAddress(conf));
httpServer.start();
setHttpServerAddress(conf);
}
+
+ private void stopHttpServer() {
+ try {
+ if (httpServer != null) httpServer.stop();
+ } catch (Exception e) {
+ LOG.error("Exception while stopping httpserver", e);
+ }
+ }
/**
* Start NameNode.
@@ -447,10 +526,23 @@
protected NameNode(Configuration conf, NamenodeRole role)
throws IOException {
+ this.conf = conf;
this.role = role;
+ String nsId = getNameServiceId(conf);
+ String namenodeId = HAUtil.getNameNodeId(conf, nsId);
+ this.haEnabled = HAUtil.isHAEnabled(conf, nsId);
+ if (!haEnabled) {
+ state = ACTIVE_STATE;
+ } else {
+ state = STANDBY_STATE;
+ }
+ this.allowStaleStandbyReads = HAUtil.shouldAllowStandbyReads(conf);
+ this.haContext = createHAContext();
try {
- initializeGenericKeys(conf, getNameServiceId(conf));
+ initializeGenericKeys(conf, nsId, namenodeId);
initialize(conf);
+ state.prepareToEnterState(haContext);
+ state.enterState(haContext);
} catch (IOException e) {
this.stop();
throw e;
@@ -460,6 +552,10 @@
}
}
+ protected HAContext createHAContext() {
+ return new NameNodeHAContext();
+ }
+
/**
* Wait for service to finish.
* (Normally, it runs forever.)
@@ -468,6 +564,7 @@
try {
this.rpcServer.join();
} catch (InterruptedException ie) {
+ LOG.info("Caught interrupted exception ", ie);
}
}
@@ -480,23 +577,14 @@
return;
stopRequested = true;
}
- if (plugins != null) {
- for (ServicePlugin p : plugins) {
- try {
- p.stop();
- } catch (Throwable t) {
- LOG.warn("ServicePlugin " + p + " could not be stopped", t);
- }
- }
- }
try {
- if (httpServer != null) httpServer.stop();
- } catch (Exception e) {
- LOG.error("Exception while stopping httpserver", e);
+ if (state != null) {
+ state.exitState(haContext);
+ }
+ } catch (ServiceFailedException e) {
+ LOG.warn("Encountered exception while exiting state ", e);
}
- if(namesystem != null) namesystem.close();
- if(emptier != null) emptier.interrupt();
- if(rpcServer != null) rpcServer.stop();
+ stopCommonServices();
if (metrics != null) {
metrics.shutdown();
}
@@ -561,6 +649,10 @@
private static boolean format(Configuration conf,
boolean isConfirmationNeeded)
throws IOException {
+ String nsId = DFSUtil.getNamenodeNameServiceId(conf);
+ String namenodeId = HAUtil.getNameNodeId(conf, nsId);
+ initializeGenericKeys(conf, nsId, namenodeId);
+
if (!conf.getBoolean(DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY,
DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_DEFAULT)) {
throw new IOException("The option " + DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY
@@ -571,7 +663,7 @@
}
Collection<URI> dirsToFormat = FSNamesystem.getNamespaceDirs(conf);
- Collection<URI> editDirsToFormat =
+ List<URI> editDirsToFormat =
FSNamesystem.getNamespaceEditsDirs(conf);
for(Iterator<URI> it = dirsToFormat.iterator(); it.hasNext();) {
File curDir = new File(it.next().getPath());
@@ -605,6 +697,10 @@
private static boolean finalize(Configuration conf,
boolean isConfirmationNeeded
) throws IOException {
+ String nsId = DFSUtil.getNamenodeNameServiceId(conf);
+ String namenodeId = HAUtil.getNameNodeId(conf, nsId);
+ initializeGenericKeys(conf, nsId, namenodeId);
+
FSNamesystem nsys = new FSNamesystem(conf, new FSImage(conf));
System.err.print(
"\"finalize\" will remove the previous state of the files system.\n"
@@ -721,6 +817,14 @@
return null;
}
setStartupOption(conf, startOpt);
+
+ if (HAUtil.isHAEnabled(conf, DFSUtil.getNamenodeNameServiceId(conf)) &&
+ (startOpt == StartupOption.UPGRADE ||
+ startOpt == StartupOption.ROLLBACK ||
+ startOpt == StartupOption.FINALIZE)) {
+ throw new HadoopIllegalArgumentException("Invalid startup option. " +
+ "Cannot perform DFS upgrade with HA enabled.");
+ }
switch (startOpt) {
case FORMAT:
@@ -761,16 +865,26 @@
* @param conf
* Configuration object to lookup specific key and to set the value
* to the key passed. Note the conf object is modified
- * @param nameserviceId name service Id
- * @see DFSUtil#setGenericConf(Configuration, String, String...)
+ * @param nameserviceId name service Id (to distinguish federated NNs)
+ * @param namenodeId the namenode ID (to distinguish HA NNs)
+ * @see DFSUtil#setGenericConf(Configuration, String, String, String...)
*/
- public static void initializeGenericKeys(Configuration conf, String
- nameserviceId) {
- if ((nameserviceId == null) || nameserviceId.isEmpty()) {
+ public static void initializeGenericKeys(Configuration conf,
+ String nameserviceId, String namenodeId) {
+ if ((nameserviceId == null || nameserviceId.isEmpty()) &&
+ (namenodeId == null || namenodeId.isEmpty())) {
return;
}
- DFSUtil.setGenericConf(conf, nameserviceId, NAMESERVICE_SPECIFIC_KEYS);
+ if (nameserviceId != null) {
+ conf.set(DFS_FEDERATION_NAMESERVICE_ID, nameserviceId);
+ }
+ if (namenodeId != null) {
+ conf.set(DFS_HA_NAMENODE_ID_KEY, namenodeId);
+ }
+
+ DFSUtil.setGenericConf(conf, nameserviceId, namenodeId,
+ NAMESERVICE_SPECIFIC_KEYS);
if (conf.get(DFS_NAMENODE_RPC_ADDRESS_KEY) != null) {
URI defaultUri = URI.create(HdfsConstants.HDFS_URI_SCHEME + "://"
+ conf.get(DFS_NAMENODE_RPC_ADDRESS_KEY));
@@ -799,4 +913,131 @@
System.exit(-1);
}
}
+
+ synchronized void monitorHealth()
+ throws HealthCheckFailedException, AccessControlException {
+ namesystem.checkSuperuserPrivilege();
+ if (!haEnabled) {
+ return; // no-op, if HA is not enabled
+ }
+ getNamesystem().checkAvailableResources();
+ if (!getNamesystem().nameNodeHasResourcesAvailable()) {
+ throw new HealthCheckFailedException(
+ "The NameNode has no resources available");
+ }
+ }
+
+ synchronized void transitionToActive()
+ throws ServiceFailedException, AccessControlException {
+ namesystem.checkSuperuserPrivilege();
+ if (!haEnabled) {
+ throw new ServiceFailedException("HA for namenode is not enabled");
+ }
+ state.setState(haContext, ACTIVE_STATE);
+ }
+
+ synchronized void transitionToStandby()
+ throws ServiceFailedException, AccessControlException {
+ namesystem.checkSuperuserPrivilege();
+ if (!haEnabled) {
+ throw new ServiceFailedException("HA for namenode is not enabled");
+ }
+ state.setState(haContext, STANDBY_STATE);
+ }
+
+ synchronized HAServiceState getServiceState() throws AccessControlException {
+ namesystem.checkSuperuserPrivilege();
+ if (state == null) {
+ return HAServiceState.INITIALIZING;
+ }
+ return state.getServiceState();
+ }
+
+ synchronized boolean readyToBecomeActive()
+ throws ServiceFailedException, AccessControlException {
+ namesystem.checkSuperuserPrivilege();
+ if (!haEnabled) {
+ throw new ServiceFailedException("HA for namenode is not enabled");
+ }
+ return !isInSafeMode();
+ }
+
+
+ /**
+ * Class used to expose {@link NameNode} as context to {@link HAState}.
+ *
+ * TODO(HA):
+ * When entering or exiting a state, if starting services fails, appropriate
+ * action is needed to either shut down the node or recover from the failure.
+ */
+ protected class NameNodeHAContext implements HAContext {
+ @Override
+ public void setState(HAState s) {
+ state = s;
+ }
+
+ @Override
+ public HAState getState() {
+ return state;
+ }
+
+ @Override
+ public void startActiveServices() throws IOException {
+ namesystem.startActiveServices();
+ startTrashEmptier(conf);
+ }
+
+ @Override
+ public void stopActiveServices() throws IOException {
+ if (namesystem != null) {
+ namesystem.stopActiveServices();
+ }
+ stopTrashEmptier();
+ }
+
+ @Override
+ public void startStandbyServices() throws IOException {
+ namesystem.startStandbyServices();
+ }
+
+ @Override
+ public void prepareToStopStandbyServices() throws ServiceFailedException {
+ namesystem.prepareToStopStandbyServices();
+ }
+
+ @Override
+ public void stopStandbyServices() throws IOException {
+ if (namesystem != null) {
+ namesystem.stopStandbyServices();
+ }
+ }
+
+ @Override
+ public void writeLock() {
+ namesystem.writeLock();
+ }
+
+ @Override
+ public void writeUnlock() {
+ namesystem.writeUnlock();
+ }
+
+ /** Check if an operation of given category is allowed */
+ @Override
+ public void checkOperation(final OperationCategory op)
+ throws StandbyException {
+ state.checkOperation(haContext, op);
+ }
+
+ @Override
+ public boolean allowStaleReads() {
+ return allowStaleStandbyReads;
+ }
+
+ }
+
+ public boolean isStandbyState() {
+ return (state.equals(STANDBY_STATE));
+ }
}
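
Side note on the constructor changes above: initializeGenericKeys() now takes both a nameservice ID and a namenode ID and copies the suffixed per-NN keys onto their generic names. A minimal sketch of that behaviour follows; it is a hypothetical snippet (class name and host/port invented), not part of the patch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;
    import org.apache.hadoop.hdfs.HdfsConfiguration;
    import org.apache.hadoop.hdfs.server.namenode.NameNode;

    public class GenericKeysSketch {
      public static void main(String[] args) {
        Configuration conf = new HdfsConfiguration();
        // Per-namenode key of the form <generic key>.<nameserviceId>.<namenodeId>
        conf.set(DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY + ".ns1.nn1",
            "host1:8020");

        NameNode.initializeGenericKeys(conf, "ns1", "nn1");

        // The nameservice and namenode IDs are recorded in the conf and the
        // suffixed key is copied onto its generic name, so the rest of the
        // NameNode code keeps reading the un-suffixed key as before.
        System.out.println(conf.get(DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY));
        // expected: host1:8020
      }
    }
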
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourceChecker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourceChecker.java
index e4817c7..a024a55 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourceChecker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourceChecker.java
@@ -46,7 +46,7 @@
* are added by default, and arbitrary extra volumes may be configured as well.
*/
@InterfaceAudience.Private
-class NameNodeResourceChecker {
+public class NameNodeResourceChecker {
private static final Log LOG = LogFactory.getLog(NameNodeResourceChecker.class.getName());
// Space (in bytes) reserved per volume.
@@ -176,8 +176,7 @@
* least one redundant volume and all of the required volumes, false
* otherwise.
*/
- boolean hasAvailableDiskSpace()
- throws IOException {
+ public boolean hasAvailableDiskSpace() {
return NameNodeResourcePolicy.areResourcesAvailable(volumes.values(),
minimumRedundantVolumes);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java
index 53cd867..3896165 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java
@@ -37,9 +37,6 @@
* required to continue operation.
* @return true if and only if there are sufficient NN resources to
* continue logging edits.
- * @throws RuntimeException if the number of <bold>configured</bold>
- * redundant resources is fewer than the minimum number of available
- * redundant resources.
*/
static boolean areResourcesAvailable(
Collection<? extends CheckableNameNodeResource> resources,
@@ -63,12 +60,6 @@
}
}
- if (redundantResourceCount < minimumRedundantResources) {
- throw new RuntimeException("Need a minimum of " + minimumRedundantResources
- + " for NN to operate but only " + redundantResourceCount
- + " are configured.");
- }
-
if (redundantResourceCount == 0) {
// If there are no redundant resources, return true if there are any
// required resources available.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
index 0750600..17b387c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java
@@ -41,6 +41,13 @@
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.ha.HAServiceProtocol;
+import org.apache.hadoop.ha.HealthCheckFailedException;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.ha.proto.HAServiceProtocolProtos.HAServiceProtocolService;
+import org.apache.hadoop.ha.protocolPB.HAServiceProtocolPB;
+import org.apache.hadoop.ha.protocolPB.HAServiceProtocolServerSideTranslatorPB;
+
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HDFSPolicyProvider;
import org.apache.hadoop.hdfs.HdfsConfiguration;
@@ -82,6 +89,7 @@
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.IncorrectVersionException;
import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport;
+import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
@@ -90,6 +98,7 @@
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.FinalizeCommand;
+import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
@@ -104,6 +113,7 @@
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.ipc.RpcPayloadHeader.RpcKind;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.ipc.WritableRpcEngine;
import org.apache.hadoop.net.Node;
@@ -127,7 +137,7 @@
private static final Log stateChangeLog = NameNode.stateChangeLog;
// Dependencies from other parts of NN.
- private final FSNamesystem namesystem;
+ protected final FSNamesystem namesystem;
protected final NameNode nn;
private final NameNodeMetrics metrics;
@@ -183,6 +193,11 @@
new GetUserMappingsProtocolServerSideTranslatorPB(this);
BlockingService getUserMappingService = GetUserMappingsProtocolService
.newReflectiveBlockingService(getUserMappingXlator);
+
+ HAServiceProtocolServerSideTranslatorPB haServiceProtocolXlator =
+ new HAServiceProtocolServerSideTranslatorPB(this);
+ BlockingService haPbService = HAServiceProtocolService
+ .newReflectiveBlockingService(haServiceProtocolXlator);
WritableRpcEngine.ensureInitialized();
@@ -198,6 +213,8 @@
dnSocketAddr.getHostName(), dnSocketAddr.getPort(),
serviceHandlerCount,
false, conf, namesystem.getDelegationTokenSecretManager());
+ DFSUtil.addPBProtocol(conf, HAServiceProtocolPB.class, haPbService,
+ serviceRpcServer);
DFSUtil.addPBProtocol(conf, NamenodeProtocolPB.class, NNPbService,
serviceRpcServer);
DFSUtil.addPBProtocol(conf, DatanodeProtocolPB.class, dnProtoPbService,
@@ -221,6 +238,8 @@
clientNNPbService, socAddr.getHostName(),
socAddr.getPort(), handlerCount, false, conf,
namesystem.getDelegationTokenSecretManager());
+ DFSUtil.addPBProtocol(conf, HAServiceProtocolPB.class, haPbService,
+ clientRpcServer);
DFSUtil.addPBProtocol(conf, NamenodeProtocolPB.class, NNPbService,
clientRpcServer);
DFSUtil.addPBProtocol(conf, DatanodeProtocolPB.class, dnProtoPbService,
@@ -287,7 +306,7 @@
throw new IllegalArgumentException(
"Unexpected not positive size: "+size);
}
-
+ namesystem.checkOperation(OperationCategory.READ);
return namesystem.getBlockManager().getBlocks(datanode, size);
}
@@ -300,6 +319,7 @@
public void errorReport(NamenodeRegistration registration,
int errorCode,
String msg) throws IOException {
+ namesystem.checkOperation(OperationCategory.UNCHECKED);
verifyRequest(registration);
LOG.info("Error report from " + registration + ": " + msg);
if(errorCode == FATAL)
@@ -327,9 +347,6 @@
@Override // NamenodeProtocol
public void endCheckpoint(NamenodeRegistration registration,
CheckpointSignature sig) throws IOException {
- verifyRequest(registration);
- if(!nn.isRole(NamenodeRole.NAMENODE))
- throw new IOException("Only an ACTIVE node can invoke endCheckpoint.");
namesystem.endCheckpoint(registration, sig);
}
@@ -478,10 +495,10 @@
return namesystem.getAdditionalDatanode(src, blk,
existings, excludeSet, numAdditionalNodes, clientName);
}
-
/**
* The client needs to give up on the block.
*/
+ @Override // ClientProtocol
public void abandonBlock(ExtendedBlock b, String src, String holder)
throws IOException {
if(stateChangeLog.isDebugEnabled()) {
@@ -509,18 +526,9 @@
* mark the block as corrupt. In the future we might
* check the blocks are actually corrupt.
*/
- @Override
+ @Override // ClientProtocol, DatanodeProtocol
public void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
- stateChangeLog.info("*DIR* NameNode.reportBadBlocks");
- for (int i = 0; i < blocks.length; i++) {
- ExtendedBlock blk = blocks[i].getBlock();
- DatanodeInfo[] nodes = blocks[i].getLocations();
- for (int j = 0; j < nodes.length; j++) {
- DatanodeInfo dn = nodes[j];
- namesystem.getBlockManager().findAndMarkBlockAsCorrupt(blk, dn,
- "client machine reported it");
- }
- }
+ namesystem.reportBadBlocks(blocks);
}
@Override // ClientProtocol
@@ -633,8 +641,7 @@
@Override // ClientProtocol
public DirectoryListing getListing(String src, byte[] startAfter,
- boolean needLocation)
- throws IOException {
+ boolean needLocation) throws IOException {
DirectoryListing files = namesystem.getListing(
src, startAfter, needLocation);
if (files != null) {
@@ -656,14 +663,16 @@
return namesystem.getFileInfo(src, false);
}
- @Override
- public long[] getStats() {
+ @Override // ClientProtocol
+ public long[] getStats() throws IOException {
+ namesystem.checkOperation(OperationCategory.READ);
return namesystem.getStats();
}
@Override // ClientProtocol
public DatanodeInfo[] getDatanodeReport(DatanodeReportType type)
- throws IOException {
+ throws IOException {
+ namesystem.checkOperation(OperationCategory.UNCHECKED);
DatanodeInfo results[] = namesystem.datanodeReport(type);
if (results == null ) {
throw new IOException("Cannot find datanode report");
@@ -673,28 +682,32 @@
@Override // ClientProtocol
public boolean setSafeMode(SafeModeAction action) throws IOException {
+ namesystem.checkOperation(OperationCategory.UNCHECKED);
return namesystem.setSafeMode(action);
}
@Override // ClientProtocol
- public boolean restoreFailedStorage(String arg)
- throws AccessControlException {
+ public boolean restoreFailedStorage(String arg) throws IOException {
+ namesystem.checkOperation(OperationCategory.UNCHECKED);
return namesystem.restoreFailedStorage(arg);
}
@Override // ClientProtocol
public void saveNamespace() throws IOException {
+ namesystem.checkOperation(OperationCategory.UNCHECKED);
namesystem.saveNamespace();
}
@Override // ClientProtocol
public void refreshNodes() throws IOException {
+ namesystem.checkOperation(OperationCategory.UNCHECKED);
namesystem.getBlockManager().getDatanodeManager().refreshNodes(
new HdfsConfiguration());
}
@Override // NamenodeProtocol
- public long getTransactionID() {
+ public long getTransactionID() throws IOException {
+ namesystem.checkOperation(OperationCategory.CHECKPOINT);
return namesystem.getEditLog().getSyncTxId();
}
@@ -703,32 +716,36 @@
return namesystem.rollEditLog();
}
- @Override
+ @Override // NamenodeProtocol
public RemoteEditLogManifest getEditLogManifest(long sinceTxId)
throws IOException {
+ namesystem.checkOperation(OperationCategory.READ);
return namesystem.getEditLog().getEditLogManifest(sinceTxId);
}
@Override // ClientProtocol
public void finalizeUpgrade() throws IOException {
+ namesystem.checkOperation(OperationCategory.WRITE);
namesystem.finalizeUpgrade();
}
@Override // ClientProtocol
public UpgradeStatusReport distributedUpgradeProgress(UpgradeAction action)
throws IOException {
+ namesystem.checkOperation(OperationCategory.READ);
return namesystem.distributedUpgradeProgress(action);
}
@Override // ClientProtocol
public void metaSave(String filename) throws IOException {
+ namesystem.checkOperation(OperationCategory.UNCHECKED);
namesystem.metaSave(filename);
}
@Override // ClientProtocol
public CorruptFileBlocks listCorruptFileBlocks(String path, String cookie)
throws IOException {
- String[] cookieTab = new String[] { cookie };
+ String[] cookieTab = new String[] { cookie };
Collection<FSNamesystem.CorruptFileBlockInfo> fbs =
namesystem.listCorruptFileBlocks(path, cookieTab);
@@ -743,11 +760,12 @@
/**
* Tell all datanodes to use a new, non-persistent bandwidth value for
* dfs.datanode.balance.bandwidthPerSec.
- * @param bandwidth Blanacer bandwidth in bytes per second for all datanodes.
+ * @param bandwidth Balancer bandwidth in bytes per second for all datanodes.
* @throws IOException
*/
@Override // ClientProtocol
public void setBalancerBandwidth(long bandwidth) throws IOException {
+ namesystem.checkOperation(OperationCategory.UNCHECKED);
namesystem.getBlockManager().getDatanodeManager().setBalancerBandwidth(bandwidth);
}
@@ -822,7 +840,7 @@
}
@Override // DatanodeProtocol
- public DatanodeCommand[] sendHeartbeat(DatanodeRegistration nodeReg,
+ public HeartbeatResponse sendHeartbeat(DatanodeRegistration nodeReg,
StorageReport[] report, int xmitsInProgress, int xceiverCount,
int failedVolumes) throws IOException {
verifyRequest(nodeReg);
@@ -844,7 +862,7 @@
}
namesystem.getBlockManager().processReport(nodeReg, poolId, blist);
- if (nn.getFSImage().isUpgradeFinalized())
+ if (nn.getFSImage().isUpgradeFinalized() && !nn.isStandbyState())
return new FinalizeCommand(poolId);
return null;
}
@@ -858,7 +876,7 @@
+"from "+nodeReg.getName()+" "+receivedAndDeletedBlocks.length
+" blocks.");
}
- namesystem.getBlockManager().blockReceivedAndDeleted(
+ namesystem.getBlockManager().processIncrementalBlockReport(
nodeReg, poolId, receivedAndDeletedBlocks[0].getBlocks());
}
@@ -946,6 +964,35 @@
return UserGroupInformation.createRemoteUser(user).getGroupNames();
}
+ @Override // HAServiceProtocol
+ public synchronized void monitorHealth()
+ throws HealthCheckFailedException, AccessControlException {
+ nn.monitorHealth();
+ }
+
+ @Override // HAServiceProtocol
+ public synchronized void transitionToActive()
+ throws ServiceFailedException, AccessControlException {
+ nn.transitionToActive();
+ }
+
+ @Override // HAServiceProtocol
+ public synchronized void transitionToStandby()
+ throws ServiceFailedException, AccessControlException {
+ nn.transitionToStandby();
+ }
+
+ @Override // HAServiceProtocol
+ public synchronized HAServiceState getServiceState()
+ throws AccessControlException {
+ return nn.getServiceState();
+ }
+
+ @Override // HAServiceProtocol
+ public synchronized boolean readyToBecomeActive()
+ throws ServiceFailedException, AccessControlException {
+ return nn.readyToBecomeActive();
+ }
/**
* Verify version.
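
The namesystem.checkOperation(OperationCategory...) calls added throughout the RPC methods above are what let a standby reject work so the client-side failover policy retries the other NameNode. Below is a hypothetical, simplified mirror of that gating pattern -- none of these classes are from the patch; in the real code the decision is delegated to the current HAState through NameNodeHAContext:

    import java.io.IOException;

    public class OperationGatingSketch {
      enum OperationCategory { READ, WRITE, CHECKPOINT, UNCHECKED }

      static class StandbyException extends IOException {
        StandbyException(String msg) { super(msg); }
      }

      interface Gate {
        void checkOperation(OperationCategory op) throws StandbyException;
      }

      /** Active: everything is allowed. */
      static class ActiveGate implements Gate {
        public void checkOperation(OperationCategory op) { }
      }

      /** Standby: only stale reads may be allowed; everything else fails over. */
      static class StandbyGate implements Gate {
        private final boolean allowStaleReads;
        StandbyGate(boolean allowStaleReads) { this.allowStaleReads = allowStaleReads; }
        public void checkOperation(OperationCategory op) throws StandbyException {
          if (op == OperationCategory.READ && allowStaleReads) {
            return;
          }
          throw new StandbyException("Operation category " + op
              + " is not supported in standby state");
        }
      }
    }
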
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java
index 64b2723..44c0751 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java
@@ -36,6 +36,7 @@
import javax.servlet.jsp.JspWriter;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.Block;
@@ -309,7 +310,16 @@
long bpUsed = fsnStats[6];
float percentBpUsed = DFSUtil.getPercentUsed(bpUsed, total);
-
+
+ // don't show under-replicated/missing blocks or corrupt files for SBN
+ // since the standby namenode doesn't compute replication queues
+ String underReplicatedBlocks = "";
+ if (nn.getServiceState() == HAServiceState.ACTIVE) {
+ underReplicatedBlocks = rowTxt()
+ + colTxt("Excludes missing blocks.")
+ + "Number of Under-Replicated Blocks" + colTxt() + ":" + colTxt()
+ + fsn.getBlockManager().getUnderReplicatedNotMissingBlocks();
+ }
out.print("<div id=\"dfstable\"> <table>\n" + rowTxt() + colTxt()
+ "Configured Capacity" + colTxt() + ":" + colTxt()
+ StringUtils.byteDesc(total) + rowTxt() + colTxt() + "DFS Used"
@@ -344,10 +354,8 @@
+ rowTxt() + colTxt()
+ "<a href=\"dfsnodelist.jsp?whatNodes=DECOMMISSIONING\">"
+ "Decommissioning Nodes</a> "
- + colTxt() + ":" + colTxt() + decommissioning.size()
- + rowTxt() + colTxt("Excludes missing blocks.")
- + "Number of Under-Replicated Blocks" + colTxt() + ":" + colTxt()
- + fsn.getBlockManager().getUnderReplicatedNotMissingBlocks()
+ + colTxt() + ":" + colTxt() + decommissioning.size()
+ + underReplicatedBlocks
+ "</table></div><br>\n");
if (live.isEmpty() && dead.isEmpty()) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java
index 6846e95..c453db5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Namesystem.java
@@ -32,4 +32,10 @@
/** @return the block pool ID */
public String getBlockPoolId();
+
+ public boolean isInStandbyState();
+
+ public boolean isGenStampInFuture(long generationStamp);
+
+ public void adjustSafeModeBlockTotals(int deltaSafe, int deltaTotal);
}
\ No newline at end of file
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceCancelledException.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceCancelledException.java
index 2731275..5b49f0e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceCancelledException.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceCancelledException.java
@@ -18,8 +18,10 @@
package org.apache.hadoop.hdfs.server.namenode;
import java.io.IOException;
+import org.apache.hadoop.classification.InterfaceAudience;
-class SaveNamespaceCancelledException extends IOException {
+@InterfaceAudience.Private
+public class SaveNamespaceCancelledException extends IOException {
private static final long serialVersionUID = 1L;
SaveNamespaceCancelledException(String cancelReason) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
index 50dca62..c1ce79e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
@@ -45,10 +45,11 @@
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.DFSUtil.ErrorSimulator;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
-import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
import org.apache.hadoop.hdfs.server.common.JspHelper;
@@ -112,17 +113,9 @@
private String infoBindAddress;
private Collection<URI> checkpointDirs;
- private Collection<URI> checkpointEditsDirs;
-
- /** How often to checkpoint regardless of number of txns */
- private long checkpointPeriod; // in seconds
-
- /** How often to poll the NN to check checkpointTxnCount */
- private long checkpointCheckPeriod; // in seconds
-
- /** checkpoint once every this many transactions, regardless of time */
- private long checkpointTxnCount;
+ private List<URI> checkpointEditsDirs;
+ private CheckpointConf checkpointConf;
private FSNamesystem namesystem;
@@ -132,9 +125,9 @@
+ "\nName Node Address : " + nameNodeAddr
+ "\nStart Time : " + new Date(starttime)
+ "\nLast Checkpoint Time : " + (lastCheckpointTime == 0? "--": new Date(lastCheckpointTime))
- + "\nCheckpoint Period : " + checkpointPeriod + " seconds"
- + "\nCheckpoint Size : " + StringUtils.byteDesc(checkpointTxnCount)
- + " (= " + checkpointTxnCount + " bytes)"
+ + "\nCheckpoint Period : " + checkpointConf.getPeriod() + " seconds"
+ + "\nCheckpoint Size : " + StringUtils.byteDesc(checkpointConf.getTxnCount())
+ + " (= " + checkpointConf.getTxnCount() + " bytes)"
+ "\nCheckpoint Dirs : " + checkpointDirs
+ "\nCheckpoint Edits Dirs: " + checkpointEditsDirs;
}
@@ -174,16 +167,19 @@
public SecondaryNameNode(Configuration conf,
CommandLineOpts commandLineOpts) throws IOException {
try {
- NameNode.initializeGenericKeys(conf,
- DFSUtil.getSecondaryNameServiceId(conf));
+ String nsId = DFSUtil.getSecondaryNameServiceId(conf);
+ if (HAUtil.isHAEnabled(conf, nsId)) {
+ throw new IOException(
+ "Cannot use SecondaryNameNode in an HA cluster." +
+ " The Standby Namenode will perform checkpointing.");
+ }
+ NameNode.initializeGenericKeys(conf, nsId, null);
initialize(conf, commandLineOpts);
- } catch(IOException e) {
+ } catch (IOException e) {
shutdown();
- LOG.fatal("Failed to start secondary namenode. ", e);
throw e;
- } catch(HadoopIllegalArgumentException e) {
+ } catch (HadoopIllegalArgumentException e) {
shutdown();
- LOG.fatal("Failed to start secondary namenode. ", e);
throw e;
}
}
@@ -216,8 +212,9 @@
nameNodeAddr = NameNode.getServiceAddress(conf, true);
this.conf = conf;
- this.namenode = new NamenodeProtocolTranslatorPB(nameNodeAddr, conf,
- UserGroupInformation.getCurrentUser());
+ this.namenode = NameNodeProxies.createNonHAProxy(conf, nameNodeAddr,
+ NamenodeProtocol.class, UserGroupInformation.getCurrentUser(),
+ true).getProxy();
// initialize checkpoint directories
fsName = getInfoServer();
@@ -231,16 +228,8 @@
namesystem = new FSNamesystem(conf, checkpointImage);
// Initialize other scheduling parameters from the configuration
- checkpointCheckPeriod = conf.getLong(
- DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY,
- DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT);
-
- checkpointPeriod = conf.getLong(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
- DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
- checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
- DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
- warnForDeprecatedConfigs(conf);
-
+ checkpointConf = new CheckpointConf(conf);
+
// initialize the webserver for uploading files.
// Kerberized SSL servers must be run from the host principal...
UserGroupInformation httpUGI =
@@ -296,21 +285,9 @@
conf.set(DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, infoBindAddress + ":" +infoPort);
LOG.info("Secondary Web-server up at: " + infoBindAddress + ":" +infoPort);
LOG.info("Secondary image servlet up at: " + infoBindAddress + ":" + imagePort);
- LOG.info("Checkpoint Period :" + checkpointPeriod + " secs " +
- "(" + checkpointPeriod/60 + " min)");
- LOG.info("Log Size Trigger :" + checkpointTxnCount + " txns");
- }
-
- static void warnForDeprecatedConfigs(Configuration conf) {
- for (String key : ImmutableList.of(
- "fs.checkpoint.size",
- "dfs.namenode.checkpoint.size")) {
- if (conf.get(key) != null) {
- LOG.warn("Configuration key " + key + " is deprecated! Ignoring..." +
- " Instead please specify a value for " +
- DFS_NAMENODE_CHECKPOINT_TXNS_KEY);
- }
- }
+ LOG.info("Checkpoint Period :" + checkpointConf.getPeriod() + " secs " +
+ "(" + checkpointConf.getPeriod()/60 + " min)");
+ LOG.info("Log Size Trigger :" + checkpointConf.getTxnCount() + " txns");
}
/**
@@ -332,36 +309,24 @@
}
public void run() {
- if (UserGroupInformation.isSecurityEnabled()) {
- UserGroupInformation ugi = null;
- try {
- ugi = UserGroupInformation.getLoginUser();
- } catch (IOException e) {
- LOG.error("Exception while getting login user", e);
- e.printStackTrace();
- Runtime.getRuntime().exit(-1);
- }
- ugi.doAs(new PrivilegedAction<Object>() {
+ SecurityUtil.doAsLoginUserOrFatal(
+ new PrivilegedAction<Object>() {
@Override
public Object run() {
doWork();
return null;
}
});
- } else {
- doWork();
- }
}
//
// The main work loop
//
public void doWork() {
-
//
// Poll the Namenode (once every checkpointCheckPeriod seconds) to find the
// number of transactions in the edit log that haven't yet been checkpointed.
//
- long period = Math.min(checkpointCheckPeriod, checkpointPeriod);
+ long period = checkpointConf.getCheckPeriod();
while (shouldRun) {
try {
@@ -380,7 +345,7 @@
long now = System.currentTimeMillis();
if (shouldCheckpointBasedOnCount() ||
- now >= lastCheckpointTime + 1000 * checkpointPeriod) {
+ now >= lastCheckpointTime + 1000 * checkpointConf.getPeriod()) {
doCheckpoint();
lastCheckpointTime = now;
}
@@ -471,19 +436,10 @@
}
String configuredAddress = DFSUtil.getInfoServer(null, conf, true);
- InetSocketAddress sockAddr = NetUtils.createSocketAddr(configuredAddress);
- if (sockAddr.getAddress().isAnyLocalAddress()) {
- if(UserGroupInformation.isSecurityEnabled()) {
- throw new IOException("Cannot use a wildcard address with security. " +
- "Must explicitly set bind address for Kerberos");
- }
- return fsName.getHost() + ":" + sockAddr.getPort();
- } else {
- if(LOG.isDebugEnabled()) {
- LOG.debug("configuredAddress = " + configuredAddress);
- }
- return configuredAddress;
- }
+ String address = DFSUtil.substituteForWildcardAddress(configuredAddress,
+ fsName.getHost());
+ LOG.debug("Will connect to NameNode at HTTP address: " + address);
+ return address;
}
/**
@@ -574,13 +530,13 @@
switch (opts.getCommand()) {
case CHECKPOINT:
long count = countUncheckpointedTxns();
- if (count > checkpointTxnCount ||
+ if (count > checkpointConf.getTxnCount() ||
opts.shouldForceCheckpoint()) {
doCheckpoint();
} else {
System.err.println("EditLog size " + count + " transactions is " +
"smaller than configured checkpoint " +
- "interval " + checkpointTxnCount + " transactions.");
+ "interval " + checkpointConf.getTxnCount() + " transactions.");
System.err.println("Skipping checkpoint.");
}
break;
@@ -626,7 +582,7 @@
}
boolean shouldCheckpointBasedOnCount() throws IOException {
- return countUncheckpointedTxns() >= checkpointTxnCount;
+ return countUncheckpointedTxns() >= checkpointConf.getTxnCount();
}
/**
@@ -642,7 +598,13 @@
StringUtils.startupShutdownMessage(SecondaryNameNode.class, argv, LOG);
Configuration tconf = new HdfsConfiguration();
- SecondaryNameNode secondary = new SecondaryNameNode(tconf, opts);
+ SecondaryNameNode secondary = null;
+ try {
+ secondary = new SecondaryNameNode(tconf, opts);
+ } catch (IOException ioe) {
+ LOG.fatal("Failed to start secondary namenode", ioe);
+ System.exit(-1);
+ }
if (opts.getCommand() != null) {
int ret = secondary.processStartupCommand(opts);
@@ -759,7 +721,7 @@
*/
CheckpointStorage(Configuration conf,
Collection<URI> imageDirs,
- Collection<URI> editsDirs) throws IOException {
+ List<URI> editsDirs) throws IOException {
super(conf, imageDirs, editsDirs);
// the 2NN never writes edits -- it only downloads them. So
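
CheckpointConf, introduced above in place of the three individual checkpoint fields, is not part of this hunk. The sketch below is only an assumption about its shape, reconstructed from the removed lines and from how getPeriod()/getTxnCount()/getCheckPeriod() are used; the class name is invented:

    // Assumed shape only -- not the actual CheckpointConf from the branch.
    import org.apache.hadoop.conf.Configuration;
    import static org.apache.hadoop.hdfs.DFSConfigKeys.*;

    public class CheckpointConfSketch {
      private final long period;       // seconds between checkpoints
      private final long checkPeriod;  // seconds between polls of the NN
      private final long txnCount;     // checkpoint after this many txns

      public CheckpointConfSketch(Configuration conf) {
        period = conf.getLong(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
            DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
        checkPeriod = conf.getLong(DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY,
            DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT);
        txnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
            DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
      }

      public long getPeriod()   { return period; }
      public long getTxnCount() { return txnCount; }
      // Matches the old Math.min(checkpointCheckPeriod, checkpointPeriod) poll interval.
      public long getCheckPeriod() { return Math.min(checkPeriod, period); }
    }
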
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java
index cc8dcca..985d85b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java
@@ -24,8 +24,11 @@
import java.util.List;
import java.lang.Math;
+import javax.servlet.http.HttpServletResponse;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
@@ -41,7 +44,8 @@
/**
* This class provides fetching a specified file from the NameNode.
*/
-class TransferFsImage {
+@InterfaceAudience.Private
+public class TransferFsImage {
public final static String CONTENT_LENGTH = "Content-Length";
public final static String MD5_HEADER = "X-MD5-Digest";
@@ -103,7 +107,7 @@
* @param storage the storage directory to transfer the image from
* @param txid the transaction ID of the image to be uploaded
*/
- static void uploadImageFromStorage(String fsName,
+ public static void uploadImageFromStorage(String fsName,
InetSocketAddress imageListenAddress,
NNStorage storage, long txid) throws IOException {
@@ -111,7 +115,20 @@
txid, imageListenAddress, storage);
// this doesn't directly upload an image, but rather asks the NN
// to connect back to the 2NN to download the specified image.
- TransferFsImage.getFileClient(fsName, fileid, null, null, false);
+ try {
+ TransferFsImage.getFileClient(fsName, fileid, null, null, false);
+ } catch (HttpGetFailedException e) {
+ if (e.getResponseCode() == HttpServletResponse.SC_CONFLICT) {
+ // this is OK - this means that a previous attempt to upload
+ // this checkpoint succeeded even though we thought it failed.
+ LOG.info("Image upload with txid " + txid +
+ " conflicted with a previous image upload to the " +
+ "same NameNode. Continuing...", e);
+ return;
+ } else {
+ throw e;
+ }
+ }
LOG.info("Uploaded image with txid " + txid + " to namenode at " +
fsName);
}
@@ -194,10 +211,11 @@
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
- throw new IOException(
+ throw new HttpGetFailedException(
"Image transfer servlet at " + url +
" failed with status code " + connection.getResponseCode() +
- "\nResponse message:\n" + connection.getResponseMessage());
+ "\nResponse message:\n" + connection.getResponseMessage(),
+ connection);
}
long advertisedSize;
@@ -289,5 +307,19 @@
String header = connection.getHeaderField(MD5_HEADER);
return (header != null) ? new MD5Hash(header) : null;
}
+
+ public static class HttpGetFailedException extends IOException {
+ private static final long serialVersionUID = 1L;
+ private final int responseCode;
+
+ HttpGetFailedException(String msg, HttpURLConnection connection) throws IOException {
+ super(msg);
+ this.responseCode = connection.getResponseCode();
+ }
+
+ public int getResponseCode() {
+ return responseCode;
+ }
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/UnsupportedActionException.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/UnsupportedActionException.java
index 9ac17fc..ca7e1d7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/UnsupportedActionException.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/UnsupportedActionException.java
@@ -32,8 +32,7 @@
/** for java.io.Serializable */
private static final long serialVersionUID = 1L;
- public UnsupportedActionException(String action) {
- super("Action " + action + "() is not supported.");
+ public UnsupportedActionException(String msg) {
+ super(msg);
}
-
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ActiveState.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ActiveState.java
new file mode 100644
index 0000000..a61e134
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ActiveState.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
+
+/**
+ * Active state of the namenode. In this state, namenode provides the namenode
+ * service and handles operations of type {@link OperationCategory#WRITE} and
+ * {@link OperationCategory#READ}.
+ */
+@InterfaceAudience.Private
+public class ActiveState extends HAState {
+ public ActiveState() {
+ super(HAServiceState.ACTIVE);
+ }
+
+ @Override
+ public void checkOperation(HAContext context, OperationCategory op) {
+ return; // All operations other than journalling are allowed in the active state
+ }
+
+ @Override
+ public boolean shouldPopulateReplQueues() {
+ return true;
+ }
+
+ @Override
+ public void setState(HAContext context, HAState s) throws ServiceFailedException {
+ if (s == NameNode.STANDBY_STATE) {
+ setStateInternal(context, s);
+ return;
+ }
+ super.setState(context, s);
+ }
+
+ @Override
+ public void enterState(HAContext context) throws ServiceFailedException {
+ try {
+ context.startActiveServices();
+ } catch (IOException e) {
+ throw new ServiceFailedException("Failed to start active services", e);
+ }
+ }
+
+ @Override
+ public void exitState(HAContext context) throws ServiceFailedException {
+ try {
+ context.stopActiveServices();
+ } catch (IOException e) {
+ throw new ServiceFailedException("Failed to stop active services", e);
+ }
+ }
+
+}
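
ActiveState only overrides the transition towards standby; the shared transition mechanics live in HAState, whose diff appears further below (truncated in this section). For orientation, the call sequence for an active-to-standby transition, as inferred from these classes, is roughly:

    // Inferred control flow (not a verbatim trace from the patch):
    //
    //   NameNode.transitionToStandby()
    //     -> state.setState(haContext, STANDBY_STATE)       // ActiveState.setState()
    //          -> setStateInternal(context, STANDBY_STATE)  // shared logic in HAState
    //               exitState(context)      -> context.stopActiveServices()
    //               context.setState(STANDBY_STATE)
    //               STANDBY_STATE.enterState(context) -> context.startStandbyServices()
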
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ConfiguredFailoverProxyProvider.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ConfiguredFailoverProxyProvider.java
new file mode 100644
index 0000000..a20880a
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/ConfiguredFailoverProxyProvider.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.NameNodeProxies;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
+import org.apache.hadoop.io.retry.FailoverProxyProvider;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.security.UserGroupInformation;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * A FailoverProxyProvider implementation which allows one to configure two URIs
+ * to connect to during fail-over. The first configured address is tried first,
+ * and on a fail-over event the other address is tried.
+ */
+public class ConfiguredFailoverProxyProvider<T> implements
+ FailoverProxyProvider<T> {
+
+ private static final Log LOG =
+ LogFactory.getLog(ConfiguredFailoverProxyProvider.class);
+
+ private final Configuration conf;
+ private final List<AddressRpcProxyPair<T>> proxies =
+ new ArrayList<AddressRpcProxyPair<T>>();
+ private final UserGroupInformation ugi;
+ private final Class<T> xface;
+
+ private int currentProxyIndex = 0;
+
+ public ConfiguredFailoverProxyProvider(Configuration conf, URI uri,
+ Class<T> xface) {
+ Preconditions.checkArgument(
+ xface.isAssignableFrom(NamenodeProtocols.class),
+ "Interface class %s is not a valid NameNode protocol!");
+ this.xface = xface;
+
+ this.conf = new Configuration(conf);
+ int maxRetries = this.conf.getInt(
+ DFSConfigKeys.DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_KEY,
+ DFSConfigKeys.DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_DEFAULT);
+ this.conf.setInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY,
+ maxRetries);
+
+ int maxRetriesOnSocketTimeouts = this.conf.getInt(
+ DFSConfigKeys.DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
+ DFSConfigKeys.DFS_CLIENT_FAILOVER_CONNECTION_RETRIES_ON_SOCKET_TIMEOUTS_DEFAULT);
+ this.conf.setInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
+ maxRetriesOnSocketTimeouts);
+
+ try {
+ ugi = UserGroupInformation.getCurrentUser();
+
+ Map<String, Map<String, InetSocketAddress>> map = DFSUtil.getHaNnRpcAddresses(
+ conf);
+ Map<String, InetSocketAddress> addressesInNN = map.get(uri.getHost());
+
+ if (addressesInNN == null || addressesInNN.size() == 0) {
+ throw new RuntimeException("Could not find any configured addresses " +
+ "for URI " + uri);
+ }
+
+ for (InetSocketAddress address : addressesInNN.values()) {
+ proxies.add(new AddressRpcProxyPair<T>(address));
+
+ // The client may have a delegation token set for the logical
+ // URI of the cluster. Clone this token to apply to each of the
+ // underlying IPC addresses so that the IPC code can find it.
+ HAUtil.cloneDelegationTokenForLogicalUri(ugi, uri, address);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public Class<T> getInterface() {
+ return xface;
+ }
+
+ /**
+ * Lazily initialize the RPC proxy object.
+ */
+ @SuppressWarnings("unchecked")
+ @Override
+ public synchronized T getProxy() {
+ AddressRpcProxyPair current = proxies.get(currentProxyIndex);
+ if (current.namenode == null) {
+ try {
+ current.namenode = NameNodeProxies.createNonHAProxy(conf,
+ current.address, xface, ugi, false).getProxy();
+ } catch (IOException e) {
+ LOG.error("Failed to create RPC proxy to NameNode", e);
+ throw new RuntimeException(e);
+ }
+ }
+ return (T)current.namenode;
+ }
+
+ @Override
+ public synchronized void performFailover(T currentProxy) {
+ currentProxyIndex = (currentProxyIndex + 1) % proxies.size();
+ }
+
+ /**
+ * A little pair object to store the address and connected RPC proxy object to
+ * an NN. Note that {@link AddressRpcProxyPair#namenode} may be null.
+ */
+ private static class AddressRpcProxyPair<T> {
+ public InetSocketAddress address;
+ public T namenode;
+
+ public AddressRpcProxyPair(InetSocketAddress address) {
+ this.address = address;
+ }
+ }
+
+ /**
+ * Close all the proxy objects which have been opened over the lifetime of
+ * this proxy provider.
+ */
+ @Override
+ public synchronized void close() throws IOException {
+ for (AddressRpcProxyPair<T> proxy : proxies) {
+ if (proxy.namenode != null) {
+ if (proxy.namenode instanceof Closeable) {
+ ((Closeable)proxy.namenode).close();
+ } else {
+ RPC.stopProxy(proxy.namenode);
+ }
+ }
+ }
+ }
+}
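
To exercise the provider above, a client needs the logical nameservice URI plus per-NN addresses and the proxy-provider binding. The snippet below is a hedged client-side example: the key spellings follow the suffix pattern used elsewhere in this patch but should be checked against DFSConfigKeys/HAUtil, and the class name, hosts, and ports are invented:

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.hdfs.HdfsConfiguration;
    import org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider;

    public class HaClientConfigSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new HdfsConfiguration();
        // Logical nameservice plus the two NNs behind it (hosts/ports invented).
        conf.set("dfs.federation.nameservices", "mycluster");
        conf.set("dfs.ha.namenodes.mycluster", "nn1,nn2");
        conf.set("dfs.namenode.rpc-address.mycluster.nn1", "host1:8020");
        conf.set("dfs.namenode.rpc-address.mycluster.nn2", "host2:8020");
        // Bind the logical URI to the failover provider added in this patch.
        conf.set("dfs.client.failover.proxy.provider.mycluster",
            ConfiguredFailoverProxyProvider.class.getName());

        // Requests against the logical URI go through the provider, which
        // rotates to the other address when performFailover() is invoked.
        FileSystem fs = FileSystem.get(URI.create("hdfs://mycluster"), conf);
        System.out.println(fs.getUri());
      }
    }
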
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
new file mode 100644
index 0000000..780bad7
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
@@ -0,0 +1,339 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.security.PrivilegedAction;
+import java.util.Collection;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolPB;
+import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB;
+import org.apache.hadoop.hdfs.server.namenode.EditLogInputException;
+import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLog;
+import org.apache.hadoop.hdfs.server.namenode.FSImage;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
+import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.security.SecurityUtil;
+
+import static org.apache.hadoop.hdfs.server.common.Util.now;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+
+
+/**
+ * EditLogTailer represents a thread which periodically reads from edits
+ * journals and applies the transactions contained within to a given
+ * FSNamesystem.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class EditLogTailer {
+ public static final Log LOG = LogFactory.getLog(EditLogTailer.class);
+
+ private final EditLogTailerThread tailerThread;
+
+ private final FSNamesystem namesystem;
+ private FSEditLog editLog;
+
+ private volatile Runtime runtime = Runtime.getRuntime();
+
+ private InetSocketAddress activeAddr;
+ private NamenodeProtocol cachedActiveProxy = null;
+
+ /**
+ * The last transaction ID at which an edit log roll was initiated.
+ */
+ private long lastRollTriggerTxId = HdfsConstants.INVALID_TXID;
+
+ /**
+ * The highest transaction ID loaded by the Standby.
+ */
+ private long lastLoadedTxnId = HdfsConstants.INVALID_TXID;
+
+ /**
+ * The last time we successfully loaded a non-zero number of edits from the
+ * shared directory.
+ */
+ private long lastLoadTimestamp;
+
+ /**
+ * How often the Standby should roll edit logs. Since the Standby only reads
+ * from finalized log segments, the Standby will only be as up-to-date as how
+ * often the logs are rolled.
+ */
+ private long logRollPeriodMs;
+
+ /**
+ * How often the Standby should check if there are new finalized segment(s)
+ * available to be read from.
+ */
+ private long sleepTimeMs;
+
+ public EditLogTailer(FSNamesystem namesystem) {
+ this.tailerThread = new EditLogTailerThread();
+ this.namesystem = namesystem;
+ this.editLog = namesystem.getEditLog();
+
+
+ Configuration conf = namesystem.getConf();
+ lastLoadTimestamp = now();
+
+ logRollPeriodMs = conf.getInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY,
+ DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_DEFAULT) * 1000;
+ if (logRollPeriodMs >= 0) {
+ this.activeAddr = getActiveNodeAddress();
+ Preconditions.checkArgument(activeAddr.getPort() > 0,
+ "Active NameNode must have an IPC port configured. " +
+ "Got address '%s'", activeAddr);
+ LOG.info("Will roll logs on active node at " + activeAddr + " every " +
+ (logRollPeriodMs / 1000) + " seconds.");
+ } else {
+ LOG.info("Not going to trigger log rolls on active node because " +
+ DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY + " is negative.");
+ }
+
+ sleepTimeMs = conf.getInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY,
+ DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_DEFAULT) * 1000;
+
+ LOG.debug("logRollPeriodMs=" + logRollPeriodMs +
+ " sleepTime=" + sleepTimeMs);
+ }
+
+ private InetSocketAddress getActiveNodeAddress() {
+ Configuration conf = namesystem.getConf();
+ Configuration activeConf = HAUtil.getConfForOtherNode(conf);
+ return NameNode.getServiceAddress(activeConf, true);
+ }
+
+ private NamenodeProtocol getActiveNodeProxy() throws IOException {
+ if (cachedActiveProxy == null) {
+ Configuration conf = namesystem.getConf();
+ NamenodeProtocolPB proxy =
+ RPC.waitForProxy(NamenodeProtocolPB.class,
+ RPC.getProtocolVersion(NamenodeProtocolPB.class), activeAddr, conf);
+ cachedActiveProxy = new NamenodeProtocolTranslatorPB(proxy);
+ }
+ assert cachedActiveProxy != null;
+ return cachedActiveProxy;
+ }
+
+ public void start() {
+ tailerThread.start();
+ }
+
+ public void stop() throws IOException {
+ tailerThread.setShouldRun(false);
+ tailerThread.interrupt();
+ try {
+ tailerThread.join();
+ } catch (InterruptedException e) {
+ LOG.warn("Edit log tailer thread exited with an exception");
+ throw new IOException(e);
+ }
+ }
+
+ @VisibleForTesting
+ FSEditLog getEditLog() {
+ return editLog;
+ }
+
+ @VisibleForTesting
+ void setEditLog(FSEditLog editLog) {
+ this.editLog = editLog;
+ }
+
+ @VisibleForTesting
+ synchronized void setRuntime(Runtime runtime) {
+ this.runtime = runtime;
+ }
+
+ public void catchupDuringFailover() throws IOException {
+ Preconditions.checkState(tailerThread == null ||
+ !tailerThread.isAlive(),
+ "Tailer thread should not be running once failover starts");
+ try {
+ doTailEdits();
+ } catch (InterruptedException e) {
+ throw new IOException(e);
+ }
+ }
+
+ private void doTailEdits() throws IOException, InterruptedException {
+ // Write lock needs to be interruptible here because the
+ // transitionToActive RPC takes the write lock before calling
+ // tailer.stop() -- so if we're not interruptible, it will
+ // deadlock.
+ namesystem.writeLockInterruptibly();
+ try {
+ FSImage image = namesystem.getFSImage();
+
+ long lastTxnId = image.getLastAppliedTxId();
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("lastTxnId: " + lastTxnId);
+ }
+ Collection<EditLogInputStream> streams;
+ try {
+ streams = editLog.selectInputStreams(lastTxnId + 1, 0, false);
+ } catch (IOException ioe) {
+ // This is acceptable: it can happen if we try to tail edits in the middle
+ // of an edit log roll, i.e. the previous segment has been finalized but
+ // the new in-progress edits file hasn't been started yet.
+ LOG.warn("Edits tailer failed to find any streams. Will try again " +
+ "later.", ioe);
+ return;
+ }
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("edit streams to load from: " + streams.size());
+ }
+
+ // Once we have streams to load, errors encountered are legitimate cause
+ // for concern, so we don't catch them here. Simple errors reading from
+ // disk are ignored.
+ long editsLoaded = 0;
+ try {
+ editsLoaded = image.loadEdits(streams, namesystem);
+ } catch (EditLogInputException elie) {
+ editsLoaded = elie.getNumEditsLoaded();
+ throw elie;
+ } finally {
+ if (editsLoaded > 0 || LOG.isDebugEnabled()) {
+ LOG.info(String.format("Loaded %d edits starting from txid %d ",
+ editsLoaded, lastTxnId));
+ }
+ }
+
+ if (editsLoaded > 0) {
+ lastLoadTimestamp = now();
+ }
+ lastLoadedTxnId = image.getLastAppliedTxId();
+ } finally {
+ namesystem.writeUnlock();
+ }
+ }
+
+ /**
+ * @return timestamp (in msec) of when we last loaded a non-zero number of edits.
+ */
+ public long getLastLoadTimestamp() {
+ return lastLoadTimestamp;
+ }
+
+ /**
+ * @return true if the configured log roll period has elapsed.
+ */
+ private boolean tooLongSinceLastLoad() {
+ return logRollPeriodMs >= 0 &&
+ (now() - lastLoadTimestamp) > logRollPeriodMs;
+ }
+
+ /**
+ * Trigger the active node to roll its logs.
+ */
+ private void triggerActiveLogRoll() {
+ LOG.info("Triggering log roll on remote NameNode " + activeAddr);
+ try {
+ getActiveNodeProxy().rollEditLog();
+ lastRollTriggerTxId = lastLoadedTxnId;
+ } catch (IOException ioe) {
+ LOG.warn("Unable to trigger a roll of the active NN", ioe);
+ }
+ }
+
+ /**
+ * The thread which does the actual work of tailing edits journals and
+ * applying the transactions to the FSNS.
+ */
+ private class EditLogTailerThread extends Thread {
+ private volatile boolean shouldRun = true;
+
+ private EditLogTailerThread() {
+ super("Edit log tailer");
+ }
+
+ private void setShouldRun(boolean shouldRun) {
+ this.shouldRun = shouldRun;
+ }
+
+ @Override
+ public void run() {
+ SecurityUtil.doAsLoginUserOrFatal(
+ new PrivilegedAction<Object>() {
+ @Override
+ public Object run() {
+ doWork();
+ return null;
+ }
+ });
+ }
+
+ private void doWork() {
+ while (shouldRun) {
+ try {
+ // There's no point in triggering a log roll if the Standby hasn't
+ // read any more transactions since the last time a roll was
+ // triggered.
+ if (tooLongSinceLastLoad() &&
+ lastRollTriggerTxId < lastLoadedTxnId) {
+ triggerActiveLogRoll();
+ }
+ /**
+ * Check again in case someone calls {@link EditLogTailer#stop} while
+ * we're triggering an edit log roll, since ipc.Client catches and
+ * ignores {@link InterruptedException} in a few places. This fixes
+ * the bug described in HDFS-2823.
+ */
+ if (!shouldRun) {
+ break;
+ }
+ doTailEdits();
+ } catch (EditLogInputException elie) {
+ LOG.warn("Error while reading edits from disk. Will try again.", elie);
+ } catch (InterruptedException ie) {
+ // interrupter should have already set shouldRun to false
+ continue;
+ } catch (Throwable t) {
+ LOG.error("Unknown error encountered while tailing edits. " +
+ "Shutting down standby NN.", t);
+ runtime.exit(1);
+ }
+
+ try {
+ Thread.sleep(sleepTimeMs);
+ } catch (InterruptedException e) {
+ LOG.warn("Edit log tailer interrupted", e);
+ }
+ }
+ }
+ }
+
+}
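
The tailer's two timing knobs are read in seconds and converted to milliseconds in the constructor above. A short illustrative example of tuning them on the standby follows; the constants are the ones referenced in this file, the class name and numeric values are arbitrary:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;
    import org.apache.hadoop.hdfs.HdfsConfiguration;

    public class TailerTuningSketch {
      public static void main(String[] args) {
        Configuration conf = new HdfsConfiguration();
        // Standby checks for newly finalized segments every 10 seconds ...
        conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 10);
        // ... and asks the active NN to roll its logs every 60 seconds.
        conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, 60);
        // A negative roll period disables remote log rolls (see the constructor above).
        // conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, -1);
      }
    }
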
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAContext.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAContext.java
new file mode 100644
index 0000000..6b070b2
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAContext.java
@@ -0,0 +1,61 @@
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
+import org.apache.hadoop.ipc.StandbyException;
+
+/**
+ * Context that is to be used by {@link HAState} for getting/setting the
+ * current state and performing required operations.
+ */
+@InterfaceAudience.Private
+public interface HAContext {
+ /** Set the state of the context to given {@code state} */
+ public void setState(HAState state);
+
+ /** Get the state from the context */
+ public HAState getState();
+
+ /** Start the services required in active state */
+ public void startActiveServices() throws IOException;
+
+ /** Stop the services when exiting active state */
+ public void stopActiveServices() throws IOException;
+
+ /** Start the services required in standby state */
+ public void startStandbyServices() throws IOException;
+
+ /** Prepare to exit the standby state */
+ public void prepareToStopStandbyServices() throws ServiceFailedException;
+
+ /** Stop the services when exiting standby state */
+ public void stopStandbyServices() throws IOException;
+
+ /**
+ * Take a write-lock on the underlying namesystem
+ * so that no concurrent state transitions or edits
+ * can be made.
+ */
+ void writeLock();
+
+ /**
+ * Unlock the lock taken by {@link #writeLock()}
+ */
+ void writeUnlock();
+
+ /**
+ * Verify that the given operation category is allowed in the
+ * current state. This is to allow NN implementations (eg BackupNode)
+ * to override it with node-specific handling.
+ */
+ void checkOperation(OperationCategory op) throws StandbyException;
+
+ /**
+ * @return true if the node should allow stale reads (ie reads
+ * while the namespace is not up to date)
+ */
+ boolean allowStaleReads();
+}
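
A hedged sketch (not part of the patch) of what a minimal HAContext implementation might look like, e.g. for a unit test: it only records the current state and supplies the write lock that HAState.setStateInternal() takes around transitions. The class name StubHAContext is made up.

    package org.apache.hadoop.hdfs.server.namenode.ha;

    import java.util.concurrent.locks.ReentrantReadWriteLock;
    import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
    import org.apache.hadoop.ipc.StandbyException;

    public class StubHAContext implements HAContext {
      private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
      private HAState state;

      @Override public void setState(HAState s) { state = s; }
      @Override public HAState getState() { return state; }
      @Override public void startActiveServices() {}
      @Override public void stopActiveServices() {}
      @Override public void startStandbyServices() {}
      @Override public void prepareToStopStandbyServices() {}
      @Override public void stopStandbyServices() {}
      @Override public void writeLock() { lock.writeLock().lock(); }
      @Override public void writeUnlock() { lock.writeLock().unlock(); }
      @Override public void checkOperation(OperationCategory op) throws StandbyException {
        state.checkOperation(this, op);   // delegate to the current state
      }
      @Override public boolean allowStaleReads() { return false; }
    }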
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAState.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAState.java
new file mode 100644
index 0000000..20ea854
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/HAState.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
+import org.apache.hadoop.hdfs.server.namenode.UnsupportedActionException;
+import org.apache.hadoop.ipc.StandbyException;
+
+/**
+ * Namenode base state to implement state machine pattern.
+ */
+@InterfaceAudience.Private
+abstract public class HAState {
+ protected final HAServiceState state;
+
+ /**
+ * Constructor
+   * @param state the generic service state this namenode state corresponds to.
+ */
+ public HAState(HAServiceState state) {
+ this.state = state;
+ }
+
+ /**
+ * @return the generic service state
+ */
+ public HAServiceState getServiceState() {
+ return state;
+ }
+
+ /**
+ * Internal method to transition the state of a given namenode to a new state.
+ * @param nn Namenode
+ * @param s new state
+ * @throws ServiceFailedException on failure to transition to new state.
+ */
+ protected final void setStateInternal(final HAContext context, final HAState s)
+ throws ServiceFailedException {
+ prepareToExitState(context);
+ s.prepareToEnterState(context);
+ context.writeLock();
+ try {
+ exitState(context);
+ context.setState(s);
+ s.enterState(context);
+ } finally {
+ context.writeUnlock();
+ }
+ }
+
+ /**
+ * Method to be overridden by subclasses to prepare to enter a state.
+ * This method is called <em>without</em> the context being locked,
+ * and after {@link #prepareToExitState(HAContext)} has been called
+ * for the previous state, but before {@link #exitState(HAContext)}
+ * has been called for the previous state.
+ * @param context HA context
+ * @throws ServiceFailedException on precondition failure
+ */
+ public void prepareToEnterState(final HAContext context)
+ throws ServiceFailedException {}
+
+ /**
+ * Method to be overridden by subclasses to perform steps necessary for
+ * entering a state.
+ * @param context HA context
+ * @throws ServiceFailedException on failure to enter the state.
+ */
+ public abstract void enterState(final HAContext context)
+ throws ServiceFailedException;
+
+ /**
+ * Method to be overridden by subclasses to prepare to exit a state.
+ * This method is called <em>without</em> the context being locked.
+ * This is used by the standby state to cancel any checkpoints
+ * that are going on. It can also be used to check any preconditions
+ * for the state transition.
+ *
+   * This method should not make any destructive changes to the state
+ * (eg stopping threads) since {@link #prepareToEnterState(HAContext)}
+ * may subsequently cancel the state transition.
+ * @param context HA context
+ * @throws ServiceFailedException on precondition failure
+ */
+ public void prepareToExitState(final HAContext context)
+ throws ServiceFailedException {}
+
+ /**
+ * Method to be overridden by subclasses to perform steps necessary for
+ * exiting a state.
+ * @param context HA context
+ * @throws ServiceFailedException on failure to enter the state.
+ */
+ public abstract void exitState(final HAContext context)
+ throws ServiceFailedException;
+
+ /**
+ * Move from the existing state to a new state
+ * @param context HA context
+ * @param s new state
+ * @throws ServiceFailedException on failure to transition to new state.
+ */
+ public void setState(HAContext context, HAState s) throws ServiceFailedException {
+    if (this == s) { // Already in the new state
+ return;
+ }
+    throw new ServiceFailedException("Transition from state " + this + " to "
+ + s + " is not allowed.");
+ }
+
+ /**
+ * Check if an operation is supported in a given state.
+ * @param context HA context
+ * @param op Type of the operation.
+ * @throws UnsupportedActionException if a given type of operation is not
+ * supported in this state.
+ */
+ public abstract void checkOperation(final HAContext context, final OperationCategory op)
+ throws StandbyException;
+
+ public abstract boolean shouldPopulateReplQueues();
+
+ /**
+ * @return String representation of the service state.
+ */
+ @Override
+ public String toString() {
+ return state.toString();
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
new file mode 100644
index 0000000..036dd43
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
@@ -0,0 +1,299 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.security.PrivilegedAction;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.server.namenode.CheckpointConf;
+import org.apache.hadoop.hdfs.server.namenode.FSImage;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.SaveNamespaceCancelledException;
+import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
+import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.SecurityUtil;
+import org.apache.hadoop.security.UserGroupInformation;
+import static org.apache.hadoop.hdfs.server.common.Util.now;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+
+/**
+ * Thread which runs inside the NN when it's in Standby state,
+ * periodically waking up to take a checkpoint of the namespace.
+ * When it takes a checkpoint, it saves it to its local
+ * storage and then uploads it to the remote NameNode.
+ */
+@InterfaceAudience.Private
+public class StandbyCheckpointer {
+ private static final Log LOG = LogFactory.getLog(StandbyCheckpointer.class);
+ private static final long PREVENT_AFTER_CANCEL_MS = 2*60*1000L;
+ private final CheckpointConf checkpointConf;
+ private final FSNamesystem namesystem;
+ private long lastCheckpointTime;
+ private final CheckpointerThread thread;
+ private String activeNNAddress;
+ private InetSocketAddress myNNAddress;
+
+ // Keep track of how many checkpoints were canceled.
+ // This is for use in tests.
+ private static int canceledCount = 0;
+
+ public StandbyCheckpointer(Configuration conf, FSNamesystem ns) {
+ this.namesystem = ns;
+ this.checkpointConf = new CheckpointConf(conf);
+ this.thread = new CheckpointerThread();
+
+ setNameNodeAddresses(conf);
+ }
+
+ /**
+ * Determine the address of the NN we are checkpointing
+ * as well as our own HTTP address from the configuration.
+ */
+ private void setNameNodeAddresses(Configuration conf) {
+ // Look up our own address.
+ String myAddrString = getHttpAddress(conf);
+
+ // Look up the active node's address
+ Configuration confForActive = HAUtil.getConfForOtherNode(conf);
+ activeNNAddress = getHttpAddress(confForActive);
+
+ // Sanity-check.
+ Preconditions.checkArgument(checkAddress(activeNNAddress),
+ "Bad address for active NN: %s", activeNNAddress);
+ Preconditions.checkArgument(checkAddress(myAddrString),
+ "Bad address for standby NN: %s", myAddrString);
+ myNNAddress = NetUtils.createSocketAddr(myAddrString);
+ }
+
+ private String getHttpAddress(Configuration conf) {
+ String configuredAddr = DFSUtil.getInfoServer(null, conf, true);
+
+ // Use the hostname from the RPC address as a default, in case
+ // the HTTP address is configured to 0.0.0.0.
+ String hostnameFromRpc = NameNode.getServiceAddress(
+ conf, true).getHostName();
+ try {
+ return DFSUtil.substituteForWildcardAddress(
+ configuredAddr, hostnameFromRpc);
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+ /**
+ * Ensure that the given address is valid and has a port
+ * specified.
+ */
+ private boolean checkAddress(String addrStr) {
+ InetSocketAddress addr = NetUtils.createSocketAddr(addrStr);
+ return addr.getPort() != 0 && !addr.getAddress().isAnyLocalAddress();
+ }
+
+ public void start() {
+ LOG.info("Starting standby checkpoint thread...\n" +
+ "Checkpointing active NN at " + activeNNAddress + "\n" +
+ "Serving checkpoints at " + myNNAddress);
+ thread.start();
+ }
+
+ public void stop() throws IOException {
+ thread.setShouldRun(false);
+ thread.interrupt();
+ try {
+ thread.join();
+ } catch (InterruptedException e) {
+      LOG.warn("Interrupted while waiting for the checkpointer thread to exit");
+ throw new IOException(e);
+ }
+ }
+
+ private void doCheckpoint() throws InterruptedException, IOException {
+ long txid;
+
+ namesystem.writeLockInterruptibly();
+ try {
+ assert namesystem.getEditLog().isOpenForRead() :
+ "Standby Checkpointer should only attempt a checkpoint when " +
+ "NN is in standby mode, but the edit logs are in an unexpected state";
+
+ FSImage img = namesystem.getFSImage();
+
+ long prevCheckpointTxId = img.getStorage().getMostRecentCheckpointTxId();
+ long thisCheckpointTxId = img.getLastAppliedOrWrittenTxId();
+ assert thisCheckpointTxId >= prevCheckpointTxId;
+ if (thisCheckpointTxId == prevCheckpointTxId) {
+ LOG.info("A checkpoint was triggered but the Standby Node has not " +
+ "received any transactions since the last checkpoint at txid " +
+ thisCheckpointTxId + ". Skipping...");
+ return;
+ }
+
+ img.saveNamespace(namesystem);
+ txid = img.getStorage().getMostRecentCheckpointTxId();
+ assert txid == thisCheckpointTxId : "expected to save checkpoint at txid=" +
+ thisCheckpointTxId + " but instead saved at txid=" + txid;
+ } finally {
+ namesystem.writeUnlock();
+ }
+
+ // Upload the saved checkpoint back to the active
+ TransferFsImage.uploadImageFromStorage(
+ activeNNAddress, myNNAddress,
+ namesystem.getFSImage().getStorage(), txid);
+ }
+
+ /**
+ * Cancel any checkpoint that's currently being made,
+ * and prevent any new checkpoints from starting for the next
+   * couple of minutes (PREVENT_AFTER_CANCEL_MS).
+ */
+ public void cancelAndPreventCheckpoints() throws ServiceFailedException {
+ try {
+ thread.preventCheckpointsFor(PREVENT_AFTER_CANCEL_MS);
+ // TODO(HA): there is a really narrow race here if we are just
+ // about to start a checkpoint - this won't cancel it!
+ namesystem.getFSImage().cancelSaveNamespace(
+ "About to exit standby state");
+ } catch (InterruptedException e) {
+      throw new ServiceFailedException(
+          "Interrupted while trying to cancel checkpoint", e);
+ }
+ }
+
+ @VisibleForTesting
+ static int getCanceledCount() {
+ return canceledCount;
+ }
+
+ private long countUncheckpointedTxns() {
+ FSImage img = namesystem.getFSImage();
+ return img.getLastAppliedOrWrittenTxId() -
+ img.getStorage().getMostRecentCheckpointTxId();
+ }
+
+ private class CheckpointerThread extends Thread {
+ private volatile boolean shouldRun = true;
+ private volatile long preventCheckpointsUntil = 0;
+
+ private CheckpointerThread() {
+ super("Standby State Checkpointer");
+ }
+
+ private void setShouldRun(boolean shouldRun) {
+ this.shouldRun = shouldRun;
+ }
+
+ @Override
+ public void run() {
+ // We have to make sure we're logged in as far as JAAS
+ // is concerned, in order to use kerberized SSL properly.
+ SecurityUtil.doAsLoginUserOrFatal(
+ new PrivilegedAction<Object>() {
+ @Override
+ public Object run() {
+ doWork();
+ return null;
+ }
+ });
+ }
+
+ /**
+ * Prevent checkpoints from occurring for some time period
+ * in the future. This is used when preparing to enter active
+ * mode. We need to not only cancel any concurrent checkpoint,
+ * but also prevent any checkpoints from racing to start just
+ * after the cancel call.
+ *
+     * @param delayMs the number of milliseconds for which checkpoints
+     * will be prevented
+ */
+ private void preventCheckpointsFor(long delayMs) {
+ preventCheckpointsUntil = now() + delayMs;
+ }
+
+ private void doWork() {
+ // Reset checkpoint time so that we don't always checkpoint
+ // on startup.
+ lastCheckpointTime = now();
+ while (shouldRun) {
+ try {
+ Thread.sleep(1000 * checkpointConf.getCheckPeriod());
+      } catch (InterruptedException ie) {
+        // Interrupted, most likely because stop() was called; fall through
+        // and re-check shouldRun before doing any work.
+      }
+ if (!shouldRun) {
+ break;
+ }
+ try {
+ // We may have lost our ticket since last checkpoint, log in again, just in case
+ if (UserGroupInformation.isSecurityEnabled()) {
+ UserGroupInformation.getCurrentUser().reloginFromKeytab();
+ }
+
+ long now = now();
+ long uncheckpointed = countUncheckpointedTxns();
+ long secsSinceLast = (now - lastCheckpointTime)/1000;
+
+ boolean needCheckpoint = false;
+ if (uncheckpointed >= checkpointConf.getTxnCount()) {
+ LOG.info("Triggering checkpoint because there have been " +
+ uncheckpointed + " txns since the last checkpoint, which " +
+ "exceeds the configured threshold " +
+ checkpointConf.getTxnCount());
+ needCheckpoint = true;
+ } else if (secsSinceLast >= checkpointConf.getPeriod()) {
+ LOG.info("Triggering checkpoint because it has been " +
+ secsSinceLast + " seconds since the last checkpoint, which " +
+ "exceeds the configured interval " + checkpointConf.getPeriod());
+ needCheckpoint = true;
+ }
+ if (needCheckpoint && now < preventCheckpointsUntil) {
+          LOG.info("But skipping this checkpoint since we are about to fail over!");
+ canceledCount++;
+ } else if (needCheckpoint) {
+ doCheckpoint();
+ lastCheckpointTime = now;
+ }
+ } catch (SaveNamespaceCancelledException ce) {
+ LOG.info("Checkpoint was cancelled: " + ce.getMessage());
+ canceledCount++;
+ } catch (InterruptedException ie) {
+ // Probably requested shutdown.
+ continue;
+ } catch (Throwable t) {
+ LOG.error("Exception in doCheckpoint", t);
+ }
+ }
+ }
+ }
+
+ @VisibleForTesting
+ String getActiveNNAddress() {
+ return activeNNAddress;
+ }
+}
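
Sketch (not part of the patch) of the two triggers the checkpointer thread evaluates each cycle: the number of un-checkpointed transactions and the seconds since the last checkpoint, each compared against its configured threshold. All values below are hypothetical.

    public class CheckpointTrigger {
      // Mirrors the needCheckpoint decision in CheckpointerThread.doWork() above.
      static boolean needCheckpoint(long uncheckpointedTxns, long txnCountThreshold,
                                    long secsSinceLast, long periodSecs) {
        return uncheckpointedTxns >= txnCountThreshold || secsSinceLast >= periodSecs;
      }

      public static void main(String[] args) {
        System.out.println(needCheckpoint(1200000L, 1000000L, 30L, 3600L)); // true (txn threshold)
        System.out.println(needCheckpoint(10L, 1000000L, 4000L, 3600L));    // true (period elapsed)
        System.out.println(needCheckpoint(10L, 1000000L, 30L, 3600L));      // false
      }
    }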
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyState.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyState.java
new file mode 100644
index 0000000..60e8371
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyState.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;
+import org.apache.hadoop.ipc.StandbyException;
+
+/**
+ * Namenode standby state. In this state the namenode acts as warm standby and
+ * keeps the following updated:
+ * <ul>
+ * <li>Namespace by getting the edits.</li>
+ * <li>Block location information by receiving block reports and blocks
+ * received from the datanodes.</li>
+ * </ul>
+ *
+ * It does not handle read/write/checkpoint operations.
+ */
+@InterfaceAudience.Private
+public class StandbyState extends HAState {
+ public StandbyState() {
+ super(HAServiceState.STANDBY);
+ }
+
+ @Override
+ public void setState(HAContext context, HAState s) throws ServiceFailedException {
+ if (s == NameNode.ACTIVE_STATE) {
+ setStateInternal(context, s);
+ return;
+ }
+ super.setState(context, s);
+ }
+
+ @Override
+ public void enterState(HAContext context) throws ServiceFailedException {
+ try {
+ context.startStandbyServices();
+ } catch (IOException e) {
+ throw new ServiceFailedException("Failed to start standby services", e);
+ }
+ }
+
+ @Override
+ public void prepareToExitState(HAContext context) throws ServiceFailedException {
+ context.prepareToStopStandbyServices();
+ }
+
+ @Override
+ public void exitState(HAContext context) throws ServiceFailedException {
+ try {
+ context.stopStandbyServices();
+ } catch (IOException e) {
+ throw new ServiceFailedException("Failed to stop standby services", e);
+ }
+ }
+
+ @Override
+ public void checkOperation(HAContext context, OperationCategory op)
+ throws StandbyException {
+ if (op == OperationCategory.UNCHECKED ||
+ (op == OperationCategory.READ && context.allowStaleReads())) {
+ return;
+ }
+ String msg = "Operation category " + op + " is not supported in state "
+ + context.getState();
+ throw new StandbyException(msg);
+ }
+
+ @Override
+ public boolean shouldPopulateReplQueues() {
+ return false;
+ }
+}
+
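
A small sketch (not part of the patch) restating the allow/deny rule from checkOperation() above: UNCHECKED operations always pass, READ passes only when the context allows stale reads, and every other category is rejected with a StandbyException.

    import org.apache.hadoop.hdfs.server.namenode.NameNode.OperationCategory;

    public class StandbyOpCheck {
      static boolean allowedInStandby(OperationCategory op, boolean allowStaleReads) {
        return op == OperationCategory.UNCHECKED
            || (op == OperationCategory.READ && allowStaleReads);
      }

      public static void main(String[] args) {
        System.out.println(allowedInStandby(OperationCategory.READ, true));   // true (stale reads on)
        System.out.println(allowedInStandby(OperationCategory.READ, false));  // false -> StandbyException
      }
    }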
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/DatanodeProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/DatanodeProtocol.java
index c9c7150..5669497 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/DatanodeProtocol.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/DatanodeProtocol.java
@@ -22,8 +22,8 @@
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.DFSConfigKeys;
-import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.security.KerberosInfo;
@@ -93,7 +93,7 @@
* sendHeartbeat() tells the NameNode that the DataNode is still
* alive and well. Includes some status info, too.
* It also gives the NameNode a chance to return
- * an array of "DatanodeCommand" objects.
+ * an array of "DatanodeCommand" objects in HeartbeatResponse.
* A DatanodeCommand tells the DataNode to invalidate local block(s),
* or to copy them to other DataNodes, etc.
* @param registration datanode registration information
@@ -103,7 +103,7 @@
* @param failedVolumes number of failed volumes
* @throws IOException on error
*/
- public DatanodeCommand[] sendHeartbeat(DatanodeRegistration registration,
+ public HeartbeatResponse sendHeartbeat(DatanodeRegistration registration,
StorageReport[] reports,
int xmitsInProgress,
int xceiverCount,
@@ -118,7 +118,8 @@
* @param registration
* @param poolId - the block pool ID for the blocks
* @param reports - report of blocks per storage
- * Each block is represented as 2 longs.
+ * Each finalized block is represented as 3 longs. Each under-
+ * construction replica is represented as 4 longs.
* This is done instead of Block[] to reduce memory used by block reports.
*
* @return - the next command for DN to process.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/HeartbeatResponse.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/HeartbeatResponse.java
new file mode 100644
index 0000000..96f74a0
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/HeartbeatResponse.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.protocol;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Response to {@link DatanodeProtocol#sendHeartbeat}
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class HeartbeatResponse implements Writable {
+ /** Commands returned from the namenode to the datanode */
+ private DatanodeCommand[] commands;
+
+ /** Information about the current HA-related state of the NN */
+ private NNHAStatusHeartbeat haStatus;
+
+ public HeartbeatResponse() {
+ // Empty constructor required for Writable
+ }
+
+ public HeartbeatResponse(DatanodeCommand[] cmds,
+ NNHAStatusHeartbeat haStatus) {
+ commands = cmds;
+ this.haStatus = haStatus;
+ }
+
+ public DatanodeCommand[] getCommands() {
+ return commands;
+ }
+
+ public NNHAStatusHeartbeat getNameNodeHaState() {
+ return haStatus;
+ }
+
+ ///////////////////////////////////////////
+ // Writable
+ ///////////////////////////////////////////
+ @Override
+ public void write(DataOutput out) throws IOException {
+ int length = commands == null ? 0 : commands.length;
+ out.writeInt(length);
+ for (int i = 0; i < length; i++) {
+ ObjectWritable.writeObject(out, commands[i], commands[i].getClass(),
+ null, true);
+ }
+ haStatus.write(out);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ int length = in.readInt();
+ commands = new DatanodeCommand[length];
+ ObjectWritable objectWritable = new ObjectWritable();
+ for (int i = 0; i < length; i++) {
+ commands[i] = (DatanodeCommand) ObjectWritable.readObject(in,
+ objectWritable, null);
+ }
+ haStatus = new NNHAStatusHeartbeat();
+ haStatus.readFields(in);
+ }
+}
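
A hedged round-trip sketch (not part of the patch) showing the new Writable format in use: an int command count, each command via ObjectWritable, then the HA status; it assumes the classes introduced by this patch are on the classpath.

    import java.io.ByteArrayInputStream;
    import java.io.DataInputStream;
    import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
    import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
    import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
    import org.apache.hadoop.io.DataOutputBuffer;

    public class HeartbeatResponseRoundTrip {
      public static void main(String[] args) throws Exception {
        HeartbeatResponse original = new HeartbeatResponse(
            new DatanodeCommand[0],
            new NNHAStatusHeartbeat(NNHAStatusHeartbeat.State.ACTIVE, 42L));

        // Serialize using the write() method above.
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // Deserialize into a fresh instance.
        HeartbeatResponse copy = new HeartbeatResponse();
        copy.readFields(new DataInputStream(
            new ByteArrayInputStream(out.getData(), 0, out.getLength())));

        System.out.println(copy.getNameNodeHaState().getState()); // ACTIVE
        System.out.println(copy.getNameNodeHaState().getTxId());  // 42
      }
    }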
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NNHAStatusHeartbeat.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NNHAStatusHeartbeat.java
new file mode 100644
index 0000000..633aa85
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NNHAStatusHeartbeat.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.protocol;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class NNHAStatusHeartbeat implements Writable {
+
+ private State state;
+ private long txid = HdfsConstants.INVALID_TXID;
+
+ public NNHAStatusHeartbeat() {
+ }
+
+ public NNHAStatusHeartbeat(State state, long txid) {
+ this.state = state;
+ this.txid = txid;
+ }
+
+ public State getState() {
+ return state;
+ }
+
+ public long getTxId() {
+ return txid;
+ }
+
+ ///////////////////////////////////////////
+ // Writable
+ ///////////////////////////////////////////
+ @Override
+ public void write(DataOutput out) throws IOException {
+ WritableUtils.writeEnum(out, state);
+ out.writeLong(txid);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ state = WritableUtils.readEnum(in, State.class);
+ txid = in.readLong();
+ }
+
+ @InterfaceAudience.Private
+ public enum State {
+ ACTIVE,
+ STANDBY;
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java
index 59b279c..a75308a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java
@@ -101,10 +101,7 @@
* call fails if the file system is in SafeMode.
* @throws IOException
* @return a unique token to identify this transaction.
- * @deprecated
- * See {@link org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode}
*/
- @Deprecated
public CheckpointSignature rollEditLog() throws IOException;
/**
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocols.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocols.java
index 4de386f..e05b8fe 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocols.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocols.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.hdfs.server.protocol;
import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
import org.apache.hadoop.security.RefreshUserMappingsProtocol;
@@ -32,5 +33,6 @@
NamenodeProtocol,
RefreshAuthorizationPolicyProtocol,
RefreshUserMappingsProtocol,
- GetUserMappingsProtocol {
+ GetUserMappingsProtocol,
+ HAServiceProtocol {
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReceivedDeletedBlockInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReceivedDeletedBlockInfo.java
index 45014ad..bde5a5e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReceivedDeletedBlockInfo.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/protocol/ReceivedDeletedBlockInfo.java
@@ -25,22 +25,47 @@
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
/**
- * A data structure to store Block and delHints together, used to send
- * received/deleted ACKs.
+ * A data structure to store the blocks in an incremental block report.
*/
public class ReceivedDeletedBlockInfo implements Writable {
Block block;
+ BlockStatus status;
String delHints;
- public final static String TODELETE_HINT = "-";
+ public static enum BlockStatus {
+ RECEIVING_BLOCK(1),
+ RECEIVED_BLOCK(2),
+ DELETED_BLOCK(3);
+
+ private final int code;
+ BlockStatus(int code) {
+ this.code = code;
+ }
+
+ public int getCode() {
+ return code;
+ }
+
+ public static BlockStatus fromCode(int code) {
+ for (BlockStatus bs : BlockStatus.values()) {
+ if (bs.code == code) {
+ return bs;
+ }
+ }
+ return null;
+ }
+ }
public ReceivedDeletedBlockInfo() {
}
- public ReceivedDeletedBlockInfo(Block blk, String delHints) {
+ public ReceivedDeletedBlockInfo(
+ Block blk, BlockStatus status, String delHints) {
this.block = blk;
+ this.status = status;
this.delHints = delHints;
}
@@ -60,13 +85,19 @@
this.delHints = hints;
}
+ public BlockStatus getStatus() {
+ return status;
+ }
+
public boolean equals(Object o) {
if (!(o instanceof ReceivedDeletedBlockInfo)) {
return false;
}
ReceivedDeletedBlockInfo other = (ReceivedDeletedBlockInfo) o;
return this.block.equals(other.getBlock())
- && this.delHints.equals(other.delHints);
+ && this.status == other.status
+ && (this.delHints == other.delHints ||
+ this.delHints != null && this.delHints.equals(other.delHints));
}
public int hashCode() {
@@ -79,23 +110,30 @@
}
public boolean isDeletedBlock() {
- return delHints.equals(TODELETE_HINT);
+ return status == BlockStatus.DELETED_BLOCK;
}
@Override
public void write(DataOutput out) throws IOException {
this.block.write(out);
- Text.writeString(out, this.delHints);
+ WritableUtils.writeVInt(out, this.status.code);
+ if (this.status == BlockStatus.DELETED_BLOCK) {
+ Text.writeString(out, this.delHints);
+ }
}
@Override
public void readFields(DataInput in) throws IOException {
this.block = new Block();
this.block.readFields(in);
- this.delHints = Text.readString(in);
+ this.status = BlockStatus.fromCode(WritableUtils.readVInt(in));
+ if (this.status == BlockStatus.DELETED_BLOCK) {
+ this.delHints = Text.readString(in);
+ }
}
public String toString() {
- return block.toString() + ", delHint: " + delHints;
+ return block.toString() + ", status: " + status +
+ ", delHint: " + delHints;
}
}
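
Sketch (not part of the patch) of the status codes used on the wire: each BlockStatus carries a small integer, the delete hint is only serialized for DELETED_BLOCK, and fromCode() maps the integer back to the enum when reading.

    import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo.BlockStatus;

    public class BlockStatusCodes {
      public static void main(String[] args) {
        for (BlockStatus bs : BlockStatus.values()) {
          // RECEIVING_BLOCK -> 1, RECEIVED_BLOCK -> 2, DELETED_BLOCK -> 3
          System.out.println(bs + " -> " + bs.getCode()
              + " -> " + BlockStatus.fromCode(bs.getCode()));
        }
      }
    }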
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java
index 1025880..edbbb22 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSAdmin.java
@@ -38,19 +38,20 @@
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.UpgradeAction;
-import org.apache.hadoop.hdfs.protocolPB.RefreshAuthorizationPolicyProtocolClientSideTranslatorPB;
-import org.apache.hadoop.hdfs.protocolPB.RefreshUserMappingsProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.RefreshUserMappingsProtocol;
import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
@@ -791,9 +792,9 @@
conf.get(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, ""));
// Create the client
- RefreshAuthorizationPolicyProtocolClientSideTranslatorPB refreshProtocol =
- new RefreshAuthorizationPolicyProtocolClientSideTranslatorPB(
- NameNode.getAddress(conf), getUGI(), conf);
+ RefreshAuthorizationPolicyProtocol refreshProtocol =
+ NameNodeProxies.createProxy(conf, FileSystem.getDefaultUri(conf),
+ RefreshAuthorizationPolicyProtocol.class).getProxy();
// Refresh the authorization policy in-effect
refreshProtocol.refreshServiceAcl();
@@ -817,9 +818,9 @@
conf.get(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, ""));
// Create the client
- RefreshUserMappingsProtocolClientSideTranslatorPB refreshProtocol =
- new RefreshUserMappingsProtocolClientSideTranslatorPB(
- NameNode.getAddress(conf), getUGI(), conf);
+ RefreshUserMappingsProtocol refreshProtocol =
+ NameNodeProxies.createProxy(conf, FileSystem.getDefaultUri(conf),
+ RefreshUserMappingsProtocol.class).getProxy();
// Refresh the user-to-groups mappings
refreshProtocol.refreshUserToGroupsMappings();
@@ -844,9 +845,9 @@
conf.get(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, ""));
// Create the client
- RefreshUserMappingsProtocolClientSideTranslatorPB refreshProtocol =
- new RefreshUserMappingsProtocolClientSideTranslatorPB(
- NameNode.getAddress(conf), getUGI(), conf);
+ RefreshUserMappingsProtocol refreshProtocol =
+ NameNodeProxies.createProxy(conf, FileSystem.getDefaultUri(conf),
+ RefreshUserMappingsProtocol.class).getProxy();
// Refresh the user-to-groups mappings
refreshProtocol.refreshSuperUserGroupsConfiguration();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java
new file mode 100644
index 0000000..13bde2a
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.tools;
+
+import java.io.PrintStream;
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.ha.HAAdmin;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * Class to extend HAAdmin to do a little bit of HDFS-specific configuration.
+ */
+public class DFSHAAdmin extends HAAdmin {
+
+ private static final Log LOG = LogFactory.getLog(DFSHAAdmin.class);
+
+ private String nameserviceId;
+
+ protected void setErrOut(PrintStream errOut) {
+ this.errOut = errOut;
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ if (conf != null) {
+ // Make a copy so we don't mutate it. Also use an HdfsConfiguration to
+ // force loading of hdfs-site.xml.
+ conf = new HdfsConfiguration(conf);
+ String nameNodePrincipal = conf.get(
+ DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, "");
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Using NN principal: " + nameNodePrincipal);
+ }
+
+ conf.set(CommonConfigurationKeys.HADOOP_SECURITY_SERVICE_USER_NAME_KEY,
+ nameNodePrincipal);
+ }
+ super.setConf(conf);
+ }
+
+ /**
+ * Try to map the given namenode ID to its service address.
+ */
+ @Override
+ protected String getServiceAddr(String nnId) {
+ HdfsConfiguration conf = (HdfsConfiguration)getConf();
+ String serviceAddr =
+ DFSUtil.getNamenodeServiceAddr(conf, nameserviceId, nnId);
+ if (serviceAddr == null) {
+ throw new IllegalArgumentException(
+ "Unable to determine service address for namenode '" + nnId + "'");
+ }
+ return serviceAddr;
+ }
+
+ @Override
+ protected String getUsageString() {
+ return "Usage: DFSHAAdmin [-ns <nameserviceId>]";
+ }
+
+ @Override
+ protected int runCmd(String[] argv) throws Exception {
+ if (argv.length < 1) {
+ printUsage(errOut);
+ return -1;
+ }
+
+ int i = 0;
+ String cmd = argv[i++];
+
+ if ("-ns".equals(cmd)) {
+ if (i == argv.length) {
+ errOut.println("Missing nameservice ID");
+ printUsage(errOut);
+ return -1;
+ }
+ nameserviceId = argv[i++];
+ if (i >= argv.length) {
+ errOut.println("Missing command");
+ printUsage(errOut);
+ return -1;
+ }
+ argv = Arrays.copyOfRange(argv, i, argv.length);
+ }
+
+ return super.runCmd(argv);
+ }
+
+ public static void main(String[] argv) throws Exception {
+ int res = ToolRunner.run(new DFSHAAdmin(), argv);
+ System.exit(res);
+ }
+}
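
Hypothetical invocation sketch (not part of the patch): the -ns option is stripped off by runCmd() above and the remaining arguments are dispatched by the inherited HAAdmin command handling. "mycluster" and "nn1" are made-up names, and -getServiceState is assumed to be one of the commands HAAdmin provides.

    import org.apache.hadoop.hdfs.tools.DFSHAAdmin;
    import org.apache.hadoop.util.ToolRunner;

    public class DFSHAAdminExample {
      public static void main(String[] args) throws Exception {
        int rc = ToolRunner.run(new DFSHAAdmin(),
            new String[] { "-ns", "mycluster", "-getServiceState", "nn1" });
        System.exit(rc);
      }
    }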
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java
index bc98995..1a99fcb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSck.java
@@ -32,11 +32,13 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.hdfs.server.namenode.NameNode;
-import org.apache.hadoop.hdfs.server.namenode.NamenodeFsck;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.server.namenode.NamenodeFsck;
+import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.security.Krb5AndCertsSslSocketConnector;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
@@ -204,8 +206,9 @@
* Derive the namenode http address from the current file system,
* either default or as set by "-fs" in the generic options.
* @return Returns http address or null if failure.
+ * @throws IOException if we can't determine the active NN address
*/
- private String getCurrentNamenodeAddress() {
+ private String getCurrentNamenodeAddress() throws IOException {
//String nnAddress = null;
Configuration conf = getConf();
@@ -222,16 +225,21 @@
System.err.println("FileSystem is " + fs.getUri());
return null;
}
- DistributedFileSystem dfs = (DistributedFileSystem) fs;
-
- // Derive the nameservice ID from the filesystem URI.
- // The URI may have been provided by a human, and the server name may be
- // aliased, so compare InetSocketAddresses instead of URI strings, and
- // test against both possible variants of RPC address.
- InetSocketAddress namenode =
- NameNode.getAddress(dfs.getUri().getAuthority());
- return DFSUtil.getInfoServer(namenode, conf, true);
+ // force client address resolution.
+ fs.exists(new Path("/"));
+
+ // Derive the nameservice ID from the filesystem connection. The URI may
+ // have been provided by a human, the server name may be aliased, or there
+ // may be multiple possible actual addresses (e.g. in an HA setup) so
+ // compare InetSocketAddresses instead of URI strings, and test against both
+ // possible configurations of RPC address (DFS_NAMENODE_RPC_ADDRESS_KEY and
+ // DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY).
+ DistributedFileSystem dfs = (DistributedFileSystem) fs;
+ DFSClient dfsClient = dfs.getClient();
+ InetSocketAddress addr = RPC.getServerAddress(dfsClient.getNamenode());
+
+ return DFSUtil.getInfoServer(addr, conf, true);
}
private int doWork(final String[] args) throws IOException {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java
index 0c751773..e3a67ed 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetConf.java
@@ -29,6 +29,7 @@
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@@ -164,7 +165,7 @@
static class NameNodesCommandHandler extends CommandHandler {
@Override
int doWorkInternal(GetConf tool) throws IOException {
- tool.printList(DFSUtil.getNNServiceRpcAddresses(tool.getConf()));
+ tool.printMap(DFSUtil.getNNServiceRpcAddresses(tool.getConf()));
return 0;
}
}
@@ -175,7 +176,7 @@
static class BackupNodesCommandHandler extends CommandHandler {
@Override
public int doWorkInternal(GetConf tool) throws IOException {
- tool.printList(DFSUtil.getBackupNodeAddresses(tool.getConf()));
+ tool.printMap(DFSUtil.getBackupNodeAddresses(tool.getConf()));
return 0;
}
}
@@ -186,7 +187,7 @@
static class SecondaryNameNodesCommandHandler extends CommandHandler {
@Override
public int doWorkInternal(GetConf tool) throws IOException {
- tool.printList(DFSUtil.getSecondaryNameNodeAddresses(tool.getConf()));
+ tool.printMap(DFSUtil.getSecondaryNameNodeAddresses(tool.getConf()));
return 0;
}
}
@@ -200,9 +201,11 @@
@Override
public int doWorkInternal(GetConf tool) throws IOException {
Configuration config = tool.getConf();
- List<InetSocketAddress> rpclist = DFSUtil.getNNServiceRpcAddresses(config);
- if (rpclist != null) {
- for (InetSocketAddress rpc : rpclist) {
+ List<ConfiguredNNAddress> cnnlist = DFSUtil.flattenAddressMap(
+ DFSUtil.getNNServiceRpcAddresses(config));
+ if (!cnnlist.isEmpty()) {
+ for (ConfiguredNNAddress cnn : cnnlist) {
+ InetSocketAddress rpc = cnn.getAddress();
tool.printOut(rpc.getHostName()+":"+rpc.getPort());
}
return 0;
@@ -232,10 +235,13 @@
void printOut(String message) {
out.println(message);
}
-
- void printList(List<InetSocketAddress> list) {
+
+ void printMap(Map<String, Map<String, InetSocketAddress>> map) {
StringBuilder buffer = new StringBuilder();
- for (InetSocketAddress address : list) {
+
+ List<ConfiguredNNAddress> cnns = DFSUtil.flattenAddressMap(map);
+ for (ConfiguredNNAddress cnn : cnns) {
+ InetSocketAddress address = cnn.getAddress();
if (buffer.length() > 0) {
buffer.append(" ");
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetGroups.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetGroups.java
index 5ad227d..51612be 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetGroups.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/GetGroups.java
@@ -21,8 +21,11 @@
import java.io.PrintStream;
import java.net.InetSocketAddress;
+import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.protocolPB.GetUserMappingsProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.security.UserGroupInformation;
@@ -34,6 +37,7 @@
* HDFS implementation of a tool for getting the groups which a given user
* belongs to.
*/
+@InterfaceAudience.Private
public class GetGroups extends GetGroupsBase {
static{
@@ -41,11 +45,11 @@
}
- GetGroups(Configuration conf) {
+ public GetGroups(Configuration conf) {
super(conf);
}
- GetGroups(Configuration conf, PrintStream out) {
+ public GetGroups(Configuration conf, PrintStream out) {
super(conf, out);
}
@@ -57,9 +61,8 @@
@Override
protected GetUserMappingsProtocol getUgmProtocol() throws IOException {
- return new GetUserMappingsProtocolClientSideTranslatorPB(
- NameNode.getAddress(getConf()), UserGroupInformation.getCurrentUser(),
- getConf());
+ return NameNodeProxies.createProxy(getConf(), FileSystem.getDefaultUri(getConf()),
+ GetUserMappingsProtocol.class).getProxy();
}
public static void main(String[] argv) throws Exception {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java
index a010830..3a460e0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java
@@ -48,6 +48,8 @@
BLOCK_ID,
BLOCK_NUM_BYTES,
BLOCK_GENERATION_STAMP,
+ BLOCK_DELTA_NUM_BYTES, // delta-encoded relative to previous block
+ BLOCK_DELTA_GEN_STAMP, // delta-encoded relative to previous block
PERMISSION_STATUS,
FS_PERMISSIONS,
CLIENT_NAME,
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java
index f1da4c6..d34bff9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java
@@ -41,7 +41,7 @@
class EditsLoaderCurrent implements EditsLoader {
private static int[] supportedVersions = { -18, -19, -20, -21, -22, -23, -24,
- -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39};
+ -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40};
private EditsVisitor v;
private int editsVersion = 0;
@@ -150,6 +150,25 @@
}
}
+ private void visit_OP_UPDATE_BLOCKS() throws IOException {
+ visitTxId();
+ v.visitStringUTF8(EditsElement.PATH);
+ VIntToken numBlocksToken = v.visitVInt(EditsElement.NUMBLOCKS);
+ for (int i = 0; i < numBlocksToken.value; i++) {
+ v.visitEnclosingElement(EditsElement.BLOCK);
+
+ v.visitLong(EditsElement.BLOCK_ID);
+ if (i == 0) {
+ v.visitVLong(EditsElement.BLOCK_NUM_BYTES);
+ v.visitVLong(EditsElement.BLOCK_GENERATION_STAMP);
+ } else {
+ v.visitVLong(EditsElement.BLOCK_DELTA_NUM_BYTES);
+ v.visitVLong(EditsElement.BLOCK_DELTA_GEN_STAMP);
+ }
+ v.leaveEnclosingElement();
+ }
+ }
+
/**
* Visit OP_RENAME_OLD
*/
@@ -521,6 +540,9 @@
case OP_START_LOG_SEGMENT: // 24
visit_OP_BEGIN_LOG_SEGMENT();
break;
+ case OP_UPDATE_BLOCKS: // 25
+ visit_OP_UPDATE_BLOCKS();
+ break;
default:
{
throw new IOException("Unknown op code " + editsOpCode);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java
index 8960cbc..fdc9892 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java
@@ -122,7 +122,8 @@
protected final DateFormat dateFormat =
new SimpleDateFormat("yyyy-MM-dd HH:mm");
private static int[] versions = { -16, -17, -18, -19, -20, -21, -22, -23,
- -24, -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39};
+ -24, -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39,
+ -40};
private int imageVersion = 0;
/* (non-Javadoc)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java
index cd88963..8a0f992 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java
@@ -30,6 +30,9 @@
/** Acquire write lock. */
public void writeLock();
+
+ /** Acquire write lock, unless interrupted while waiting */
+ void writeLockInterruptibly() throws InterruptedException;
/** Release write lock. */
public void writeUnlock();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto b/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto
index 81ca74b..2a96544 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/proto/DatanodeProtocol.proto
@@ -189,10 +189,25 @@
}
/**
+ * state - State the NN is in when returning response to the DN
+ * txid - Highest transaction ID this NN has seen
+ */
+message NNHAStatusHeartbeatProto {
+ enum State {
+ ACTIVE = 0;
+ STANDBY = 1;
+ }
+ required State state = 1;
+ required uint64 txid = 2;
+}
+
+/**
* cmds - Commands from namenode to datanode.
+ * haStatus - Status (from an HA perspective) of the NN sending this response
*/
message HeartbeatResponseProto {
repeated DatanodeCommandProto cmds = 1; // Returned commands can be null
+ required NNHAStatusHeartbeatProto haStatus = 2;
}
/**
@@ -226,12 +241,16 @@
/**
* Data structure to send received or deleted block information
* from datanode to namenode.
- *
- * deleteHint set to "-" indicates block deletion.
- * other deleteHint indicates block addition.
*/
message ReceivedDeletedBlockInfoProto {
+ enum BlockStatus {
+ RECEIVING = 1; // block being created
+ RECEIVED = 2; // block creation complete
+ DELETED = 3;
+ }
+
required BlockProto block = 1;
+ required BlockStatus status = 3;
optional string deleteHint = 2;
}
@@ -350,7 +369,9 @@
rpc blockReport(BlockReportRequestProto) returns(BlockReportResponseProto);
/**
- * Report from datanode about recently received or deleted block
+ * Incremental block report from the DN. This contains info about recently
+ * received and deleted blocks, as well as when blocks start being
+ * received.
*/
rpc blockReceivedAndDeleted(BlockReceivedAndDeletedRequestProto)
returns(BlockReceivedAndDeletedResponseProto);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
index 1ce090b..9ec25d2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
@@ -33,9 +33,11 @@
<property>
<name>dfs.namenode.logging.level</name>
<value>info</value>
- <description>The logging level for dfs namenode. Other values are "dir"(trac
-e namespace mutations), "block"(trace block under/over replications and block
-creations/deletions), or "all".</description>
+ <description>
+ The logging level for dfs namenode. Other values are "dir" (trace
+ namespace mutations), "block" (trace block under/over replications
+ and block creations/deletions), or "all".
+ </description>
</property>
<property>
@@ -226,6 +228,18 @@
directories, for redundancy. Default value is same as dfs.namenode.name.dir
</description>
</property>
+
+<property>
+ <name>dfs.namenode.shared.edits.dir</name>
+ <value></value>
+ <description>A directory on shared storage between the multiple namenodes
+ in an HA cluster. This directory will be written by the active and read
+ by the standby in order to keep the namespaces synchronized. This directory
+ does not need to be listed in dfs.namenode.edits.dir above. It should be
+ left empty in a non-HA cluster.
+ </description>
+</property>
+
<property>
<name>dfs.web.ugi</name>
<value>webuser,webgroup</value>
@@ -624,6 +638,19 @@
</property>
<property>
+ <name>dfs.namenode.num.extra.edits.retained</name>
+ <value>1000000</value>
+ <description>The number of extra transactions which should be retained
+ beyond what is minimally necessary for a NN restart. This can be useful for
+ audit purposes or for an HA setup where a remote Standby Node may have
+ been offline for some time and need to have a longer backlog of retained
+ edits in order to start again.
+ Typically each edit is on the order of a few hundred bytes, so the default
+ of 1 million edits should be on the order of hundreds of MBs or low GBs.
+ </description>
+</property>
+
+<property>
<name>dfs.namenode.delegation.key.update-interval</name>
<value>86400000</value>
<description>The update interval for master key for delegation tokens
@@ -698,4 +725,118 @@
</description>
</property>
+<property>
+ <name>dfs.client.failover.max.attempts</name>
+ <value>15</value>
+ <description>
+ Expert only. The number of client failover attempts that should be
+ made before the failover is considered failed.
+ </description>
+</property>
+
+<property>
+ <name>dfs.client.failover.sleep.base.millis</name>
+ <value>500</value>
+ <description>
+ Expert only. The time to wait, in milliseconds, between failover
+ attempts increases exponentially as a function of the number of
+ attempts made so far, with a random factor of +/- 50%. This option
+ specifies the base value used in the failover calculation. The
+    first failover will retry immediately. The second failover attempt
+    will delay at least dfs.client.failover.sleep.base.millis
+    milliseconds, and so on.
+ </description>
+</property>
+
+<property>
+ <name>dfs.client.failover.sleep.max.millis</name>
+ <value>15000</value>
+ <description>
+ Expert only. The time to wait, in milliseconds, between failover
+ attempts increases exponentially as a function of the number of
+ attempts made so far, with a random factor of +/- 50%. This option
+ specifies the maximum value to wait between failovers.
+ Specifically, the time between two failover attempts will not
+ exceed +/- 50% of dfs.client.failover.sleep.max.millis
+ milliseconds.
+ </description>
+</property>
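To make the backoff policy described by the two properties above concrete, here is a minimal, self-contained sketch (not the actual DFSClient code; the class and method names are hypothetical): the delay grows exponentially from the base value, is capped at the max value, and a random factor of +/- 50% is applied.

    import java.util.Random;

    class FailoverBackoffSketch {
      private static final Random RANDOM = new Random();

      // Returns the delay, in milliseconds, before the given failover attempt.
      // attempt == 0 is the first failover, which retries immediately.
      static long delayMillis(int attempt, long baseMillis, long maxMillis) {
        if (attempt == 0) {
          return 0;
        }
        double capped = Math.min(baseMillis * Math.pow(2, attempt - 1), maxMillis);
        // +/- 50% random factor (the real client's exact jitter handling may differ).
        double jitter = 0.5 + RANDOM.nextDouble();
        return (long) (capped * jitter);
      }
    }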
+
+<property>
+ <name>dfs.client.failover.connection.retries</name>
+ <value>0</value>
+ <description>
+ Expert only. Indicates the number of retries a failover IPC client
+ will make to establish a server connection.
+ </description>
+</property>
+
+<property>
+ <name>dfs.client.failover.connection.retries.on.timeouts</name>
+ <value>0</value>
+ <description>
+ Expert only. The number of retry attempts a failover IPC client
+ will make on socket timeout when establishing a server connection.
+ </description>
+</property>
+
+<property>
+ <name>dfs.federation.nameservices</name>
+ <value></value>
+ <description>
+ Comma-separated list of nameservices.
+ </description>
+</property>
+
+<property>
+ <name>dfs.federation.nameservice.id</name>
+ <value></value>
+ <description>
+ The ID of this nameservice. If the nameservice ID is not
+ configured or more than one nameservice is configured for
+    dfs.federation.nameservices, it is determined automatically by
+ matching the local node's address with the configured address.
+ </description>
+</property>
+
+<property>
+ <name>dfs.ha.namenodes.EXAMPLENAMESERVICE</name>
+ <value></value>
+ <description>
+    The prefix for a given nameservice; it contains a comma-separated
+    list of namenodes for the given nameservice (e.g. EXAMPLENAMESERVICE).
+ </description>
+</property>
+
+<property>
+ <name>dfs.ha.namenode.id</name>
+ <value></value>
+ <description>
+ The ID of this namenode. If the namenode ID is not configured it
+ is determined automatically by matching the local node's address
+ with the configured address.
+ </description>
+</property>
+
+<property>
+ <name>dfs.ha.log-roll.period</name>
+ <value>120</value>
+ <description>
+ How often, in seconds, the StandbyNode should ask the active to
+    roll edit logs. Since the StandbyNode only reads from finalized
+    log segments, it can only be as up-to-date as the most recent log
+    roll. Note that failover triggers a log roll, so the StandbyNode
+    will be up to date before it becomes active.
+ </description>
+</property>
+
+<property>
+ <name>dfs.ha.tail-edits.period</name>
+ <value>60</value>
+ <description>
+ How often, in seconds, the StandbyNode should check for new
+ finalized log segments in the shared edits log.
+ </description>
+</property>
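Pulling the HA-related keys above together, a hedged configuration sketch for a single nameservice with two namenodes might look like the following. The nameservice ID, namenode IDs, hosts, and shared-edits path are placeholders; the per-namenode dfs.namenode.rpc-address.NAMESERVICE.NAMENODE keys follow the same suffixing convention (DFSUtil.addKeySuffixes) used by the MiniDFSCluster changes later in this patch.

    import org.apache.hadoop.conf.Configuration;

    class HaConfigSketch {
      static Configuration exampleHaConf() {
        Configuration conf = new Configuration();
        // One nameservice, "ns1", containing two namenodes, "nn1" and "nn2".
        conf.set("dfs.federation.nameservices", "ns1");
        conf.set("dfs.ha.namenodes.ns1", "nn1,nn2");
        // Per-namenode RPC addresses (placeholder hosts).
        conf.set("dfs.namenode.rpc-address.ns1.nn1", "nn1.example.com:8020");
        conf.set("dfs.namenode.rpc-address.ns1.nn2", "nn2.example.com:8020");
        // Shared storage written by the active and tailed by the standby.
        conf.set("dfs.namenode.shared.edits.dir", "file:///mnt/shared/edits");
        return conf;
      }
    }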
+
</configuration>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp
index a71f40f..a4906a5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/corrupt_files.jsp
@@ -23,6 +23,7 @@
import="org.apache.hadoop.fs.FileStatus"
import="org.apache.hadoop.fs.FileUtil"
import="org.apache.hadoop.fs.Path"
+ import="org.apache.hadoop.ha.HAServiceProtocol.HAServiceState"
import="java.util.Collection"
import="java.util.Arrays" %>
<%!//for java.io.Serializable
@@ -30,6 +31,8 @@
<%
NameNode nn = NameNodeHttpServer.getNameNodeFromContext(application);
FSNamesystem fsn = nn.getNamesystem();
+ HAServiceState nnHAState = nn.getServiceState();
+ boolean isActive = (nnHAState == HAServiceState.ACTIVE);
String namenodeRole = nn.getRole().toString();
String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":"
+ nn.getNameNodeAddress().getPort();
@@ -46,8 +49,10 @@
<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>
<%=NamenodeJspHelper.getVersionTable(fsn)%>
<br>
-<b><a href="/nn_browsedfscontent.jsp">Browse the filesystem</a></b>
-<br>
+<% if (isActive) { %>
+ <b><a href="/nn_browsedfscontent.jsp">Browse the filesystem</a></b>
+ <br>
+<% } %>
<b><a href="/logs/"><%=namenodeRole%> Logs</a></b>
<br>
<b><a href=/dfshealth.jsp> Go back to DFS home</a></b>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp
index ecce30a..81e595d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp
@@ -20,6 +20,7 @@
<%@ page
contentType="text/html; charset=UTF-8"
import="org.apache.hadoop.util.ServletUtil"
+ import="org.apache.hadoop.ha.HAServiceProtocol.HAServiceState"
%>
<%!
//for java.io.Serializable
@@ -29,7 +30,10 @@
final NamenodeJspHelper.HealthJsp healthjsp = new NamenodeJspHelper.HealthJsp();
NameNode nn = NameNodeHttpServer.getNameNodeFromContext(application);
FSNamesystem fsn = nn.getNamesystem();
+ HAServiceState nnHAState = nn.getServiceState();
+ boolean isActive = (nnHAState == HAServiceState.ACTIVE);
String namenodeRole = nn.getRole().toString();
+ String namenodeState = nnHAState.toString();
String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameNodeAddress().getPort();
%>
@@ -40,10 +44,12 @@
<title>Hadoop <%=namenodeRole%> <%=namenodeLabel%></title>
<body>
-<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>
+<h1><%=namenodeRole%> '<%=namenodeLabel%>' (<%=namenodeState%>)</h1>
<%= NamenodeJspHelper.getVersionTable(fsn) %>
<br />
-<b><a href="/nn_browsedfscontent.jsp">Browse the filesystem</a></b><br>
+<% if (isActive) { %>
+ <b><a href="/nn_browsedfscontent.jsp">Browse the filesystem</a></b><br>
+<% } %>
<b><a href="/logs/"><%=namenodeRole%> Logs</a></b>
<hr>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp
index 886fbea..35deb05 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfsnodelist.jsp
@@ -20,6 +20,7 @@
<%@ page
contentType="text/html; charset=UTF-8"
import="org.apache.hadoop.util.ServletUtil"
+ import="org.apache.hadoop.ha.HAServiceProtocol.HAServiceState"
%>
<%!
//for java.io.Serializable
@@ -30,6 +31,8 @@
NameNode nn = NameNodeHttpServer.getNameNodeFromContext(application);
String namenodeRole = nn.getRole().toString();
FSNamesystem fsn = nn.getNamesystem();
+HAServiceState nnHAState = nn.getServiceState();
+boolean isActive = (nnHAState == HAServiceState.ACTIVE);
String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":" + nn.getNameNodeAddress().getPort();
%>
@@ -43,7 +46,9 @@
<h1><%=namenodeRole%> '<%=namenodeLabel%>'</h1>
<%= NamenodeJspHelper.getVersionTable(fsn) %>
<br />
-<b><a href="/nn_browsedfscontent.jsp">Browse the filesystem</a></b><br>
+<% if (isActive) { %>
+ <b><a href="/nn_browsedfscontent.jsp">Browse the filesystem</a></b><br>
+<% } %>
<b><a href="/logs/"><%=namenodeRole%> Logs</a></b><br>
<b><a href=/dfshealth.jsp> Go back to DFS home</a></b>
<hr>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestResolveHdfsSymlink.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestResolveHdfsSymlink.java
index 17608ac..1d5def6 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestResolveHdfsSymlink.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/TestResolveHdfsSymlink.java
@@ -23,6 +23,7 @@
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
@@ -47,9 +48,11 @@
@BeforeClass
public static void setUp() throws IOException {
Configuration conf = new HdfsConfiguration();
+ conf.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
cluster = new MiniDFSCluster.Builder(conf).build();
cluster.waitActive();
- NameNodeAdapter.getDtSecretManager(cluster.getNamesystem()).startThreads();
+
}
@AfterClass
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFileSystemHdfs.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFileSystemHdfs.java
index 83115ed..7ad56c0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFileSystemHdfs.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFileSystemHdfs.java
@@ -27,8 +27,9 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileSystemTestHelper;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.security.UserGroupInformation;
import org.junit.After;
import org.junit.AfterClass;
@@ -51,12 +52,15 @@
public static void clusterSetupAtBegining() throws IOException,
LoginException, URISyntaxException {
SupportsBlocks = true;
+ CONF.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
+
cluster =
- new MiniDFSCluster.Builder(CONF).numNameNodes(2).numDataNodes(2)
+ new MiniDFSCluster.Builder(CONF).nnTopology(
+ MiniDFSNNTopology.simpleFederatedTopology(2))
+ .numDataNodes(2)
.build();
cluster.waitClusterUp();
- NameNodeAdapter.getDtSecretManager(cluster.getNamesystem(0)).startThreads();
- NameNodeAdapter.getDtSecretManager(cluster.getNamesystem(1)).startThreads();
fHdfs = cluster.getFileSystem(0);
fHdfs2 = cluster.getFileSystem(1);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFsHdfs.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFsHdfs.java
index 4a60556..0e94b4e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFsHdfs.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/viewfs/TestViewFsHdfs.java
@@ -26,9 +26,9 @@
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
@@ -51,9 +51,11 @@
public static void clusterSetupAtBegining() throws IOException,
LoginException, URISyntaxException {
SupportsBlocks = true;
+ CONF.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
+
cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(2).build();
cluster.waitClusterUp();
- NameNodeAdapter.getDtSecretManager(cluster.getNamesystem()).startThreads();
fc = FileContext.getFileContext(cluster.getURI(0), CONF);
defaultWorkingDirectory = fc.makeQualified( new Path("/user/" +
UserGroupInformation.getCurrentUser().getShortUserName()));
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/AppendTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/AppendTestUtil.java
index 50a34a8..f286481 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/AppendTestUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/AppendTestUtil.java
@@ -109,12 +109,18 @@
out.write(bytes);
}
- static void check(FileSystem fs, Path p, long length) throws IOException {
+ public static void check(FileSystem fs, Path p, long length) throws IOException {
int i = -1;
try {
final FileStatus status = fs.getFileStatus(p);
- TestCase.assertEquals(length, status.getLen());
- InputStream in = fs.open(p);
+ FSDataInputStream in = fs.open(p);
+ if (in.getWrappedStream() instanceof DFSInputStream) {
+ long len = ((DFSInputStream)in.getWrappedStream()).getFileLength();
+ TestCase.assertEquals(length, len);
+ } else {
+ TestCase.assertEquals(length, status.getLen());
+ }
+
for(i++; i < length; i++) {
TestCase.assertEquals((byte)i, (byte)in.read());
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java
index 409dd37..7854f95 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/DFSTestUtil.java
@@ -18,6 +18,8 @@
package org.apache.hadoop.hdfs;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY;
import static org.junit.Assert.assertEquals;
import java.io.BufferedOutputStream;
@@ -38,9 +40,11 @@
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
+import java.util.Set;
import java.util.concurrent.TimeoutException;
import org.apache.hadoop.conf.Configuration;
@@ -52,6 +56,7 @@
import org.apache.hadoop.fs.FileSystem.Statistics;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSClient.DFSDataInputStream;
+import org.apache.hadoop.hdfs.MiniDFSCluster.NameNodeInfo;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
@@ -74,6 +79,8 @@
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
+import com.google.common.base.Joiner;
+
/** Utilities for HDFS tests */
public class DFSTestUtil {
@@ -681,4 +688,21 @@
return BlockOpResponseProto.parseDelimitedFrom(in);
}
+
+ public static void setFederatedConfiguration(MiniDFSCluster cluster,
+ Configuration conf) {
+ Set<String> nameservices = new HashSet<String>();
+ for (NameNodeInfo info : cluster.getNameNodeInfos()) {
+ assert info.nameserviceId != null;
+ nameservices.add(info.nameserviceId);
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY,
+ info.nameserviceId), DFSUtil.createUri(HdfsConstants.HDFS_URI_SCHEME,
+ info.nameNode.getNameNodeAddress()).toString());
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY,
+ info.nameserviceId), DFSUtil.createUri(HdfsConstants.HDFS_URI_SCHEME,
+ info.nameNode.getNameNodeAddress()).toString());
+ }
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, Joiner.on(",")
+ .join(nameservices));
+ }
}
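As a usage illustration of the new setFederatedConfiguration helper above (a sketch only; it mirrors the federated test setup elsewhere in this patch rather than quoting an actual test):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSTestUtil;
    import org.apache.hadoop.hdfs.HdfsConfiguration;
    import org.apache.hadoop.hdfs.MiniDFSCluster;
    import org.apache.hadoop.hdfs.MiniDFSNNTopology;

    class FederatedClusterSketch {
      static MiniDFSCluster startTwoNameserviceCluster() throws Exception {
        Configuration conf = new HdfsConfiguration();
        MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
            .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
            .numDataNodes(2)
            .build();
        cluster.waitActive();
        // Publish both nameservices' RPC addresses back into conf so that
        // client code (e.g. viewfs mounts) can resolve them by nameservice ID.
        DFSTestUtil.setFederatedConfiguration(cluster, conf);
        return cluster;
      }
    }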
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
index c3cc6bb..8888bec 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java
@@ -32,6 +32,7 @@
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.List;
import java.util.Random;
import org.apache.commons.logging.Log;
@@ -41,15 +42,25 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
+
+import org.apache.hadoop.ha.HAServiceProtocol;
+import org.apache.hadoop.ha.HAServiceProtocolHelper;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.ha.protocolPB.HAServiceProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology.NNConf;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
+import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.datanode.DataStorage;
import org.apache.hadoop.hdfs.server.datanode.FSDatasetInterface;
@@ -60,6 +71,7 @@
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
import org.apache.hadoop.hdfs.tools.DFSAdmin;
+import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.net.DNSToSwitchMapping;
import org.apache.hadoop.net.NetUtils;
@@ -69,6 +81,11 @@
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.io.Files;
+
/**
* This class creates a single-process DFS cluster for junit testing.
* The data directories for non-simulated DFS are under the testing directory.
@@ -94,7 +111,6 @@
private int nameNodePort = 0;
private int nameNodeHttpPort = 0;
private final Configuration conf;
- private int numNameNodes = 1;
private int numDataNodes = 1;
private boolean format = true;
private boolean manageNameDfsDirs = true;
@@ -106,22 +122,13 @@
private String clusterId = null;
private boolean waitSafeMode = true;
private boolean setupHostsFile = false;
- private boolean federation = false;
+ private MiniDFSNNTopology nnTopology = null;
public Builder(Configuration conf) {
this.conf = conf;
}
/**
- * default false - non federated cluster
- * @param val
- * @return Builder object
- */
- public Builder federation (boolean val){
- this.federation = val;
- return this;
- }
- /**
* Default: 0
*/
public Builder nameNodePort(int val) {
@@ -140,14 +147,6 @@
/**
* Default: 1
*/
- public Builder numNameNodes(int val) {
- this.numNameNodes = val;
- return this;
- }
-
- /**
- * Default: 1
- */
public Builder numDataNodes(int val) {
this.numDataNodes = val;
return this;
@@ -235,6 +234,16 @@
}
/**
+ * Default: a single namenode.
+ * See {@link MiniDFSNNTopology#simpleFederatedTopology(int)} to set up
+     * federated nameservices.
+ */
+ public Builder nnTopology(MiniDFSNNTopology topology) {
+ this.nnTopology = topology;
+ return this;
+ }
+
+ /**
* Construct the actual MiniDFSCluster
*/
public MiniDFSCluster build() throws IOException {
@@ -246,15 +255,17 @@
* Used by builder to create and return an instance of MiniDFSCluster
*/
private MiniDFSCluster(Builder builder) throws IOException {
- LOG.info("starting cluster with " + builder.numNameNodes + " namenodes.");
- nameNodes = new NameNodeInfo[builder.numNameNodes];
- // try to determine if in federation mode
- if(builder.numNameNodes > 1)
- builder.federation = true;
+ if (builder.nnTopology == null) {
+ // If no topology is specified, build a single NN.
+ builder.nnTopology = MiniDFSNNTopology.simpleSingleNN(
+ builder.nameNodePort, builder.nameNodeHttpPort);
+ }
+
+ LOG.info("starting cluster with " +
+ builder.nnTopology.countNameNodes() + " namenodes.");
+ nameNodes = new NameNodeInfo[builder.nnTopology.countNameNodes()];
- initMiniDFSCluster(builder.nameNodePort,
- builder.nameNodeHttpPort,
- builder.conf,
+ initMiniDFSCluster(builder.conf,
builder.numDataNodes,
builder.format,
builder.manageNameDfsDirs,
@@ -266,7 +277,7 @@
builder.clusterId,
builder.waitSafeMode,
builder.setupHostsFile,
- builder.federation);
+ builder.nnTopology);
}
public class DataNodeProperties {
@@ -288,8 +299,16 @@
new ArrayList<DataNodeProperties>();
private File base_dir;
private File data_dir;
- private boolean federation = false;
private boolean waitSafeMode = true;
+ private boolean federation;
+
+ /**
+ * A unique instance identifier for the cluster. This
+ * is used to disambiguate HA filesystems in the case where
+ * multiple MiniDFSClusters are used in the same test suite.
+ */
+ private int instanceId;
+ private static int instanceCount = 0;
/**
* Stores the information related to a namenode in the cluster
@@ -297,8 +316,13 @@
static class NameNodeInfo {
final NameNode nameNode;
final Configuration conf;
- NameNodeInfo(NameNode nn, Configuration conf) {
+ final String nameserviceId;
+ final String nnId;
+ NameNodeInfo(NameNode nn, String nameserviceId, String nnId,
+ Configuration conf) {
this.nameNode = nn;
+ this.nameserviceId = nameserviceId;
+ this.nnId = nnId;
this.conf = conf;
}
}
@@ -309,6 +333,9 @@
*/
public MiniDFSCluster() {
nameNodes = new NameNodeInfo[0]; // No namenode in the cluster
+ synchronized (MiniDFSCluster.class) {
+ instanceId = instanceCount++;
+ }
}
/**
@@ -480,22 +507,27 @@
String[] racks, String hosts[],
long[] simulatedCapacities) throws IOException {
this.nameNodes = new NameNodeInfo[1]; // Single namenode in the cluster
- initMiniDFSCluster(nameNodePort, 0, conf, numDataNodes, format,
+ initMiniDFSCluster(conf, numDataNodes, format,
manageNameDfsDirs, manageDataDfsDirs, operation, racks, hosts,
- simulatedCapacities, null, true, false, false);
+ simulatedCapacities, null, true, false,
+ MiniDFSNNTopology.simpleSingleNN(nameNodePort, 0));
}
- private void initMiniDFSCluster(int nameNodePort, int nameNodeHttpPort,
+ private void initMiniDFSCluster(
Configuration conf,
int numDataNodes, boolean format, boolean manageNameDfsDirs,
boolean manageDataDfsDirs, StartupOption operation, String[] racks,
String[] hosts, long[] simulatedCapacities, String clusterId,
- boolean waitSafeMode, boolean setupHostsFile, boolean federation)
+ boolean waitSafeMode, boolean setupHostsFile,
+ MiniDFSNNTopology nnTopology)
throws IOException {
+ synchronized (MiniDFSCluster.class) {
+ instanceId = instanceCount++;
+ }
+
this.conf = conf;
base_dir = new File(determineDfsBaseDir());
data_dir = new File(base_dir, "data");
- this.federation = federation;
this.waitSafeMode = waitSafeMode;
int replication = conf.getInt(DFS_REPLICATION_KEY, 3);
@@ -505,28 +537,25 @@
conf.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
StaticMapping.class, DNSToSwitchMapping.class);
- Collection<String> nameserviceIds = DFSUtil.getNameServiceIds(conf);
- if(nameserviceIds.size() > 1)
- federation = true;
-
- if (!federation) {
- conf.set(FS_DEFAULT_NAME_KEY, "127.0.0.1:" + nameNodePort);
- conf.set(DFS_NAMENODE_HTTP_ADDRESS_KEY, "127.0.0.1:"
- + nameNodeHttpPort);
- NameNode nn = createNameNode(0, conf, numDataNodes, manageNameDfsDirs,
- format, operation, clusterId);
- nameNodes[0] = new NameNodeInfo(nn, conf);
- FileSystem.setDefaultUri(conf, getURI(0));
- } else {
- if (nameserviceIds.isEmpty()) {
- for (int i = 0; i < nameNodes.length; i++) {
- nameserviceIds.add(NAMESERVICE_ID_PREFIX + i);
- }
- }
- initFederationConf(conf, nameserviceIds, numDataNodes, nameNodePort);
- createFederationNamenodes(conf, nameserviceIds, manageNameDfsDirs, format,
- operation, clusterId);
+ // In an HA cluster, in order for the StandbyNode to perform checkpoints,
+ // it needs to know the HTTP port of the Active. So, if ephemeral ports
+ // are chosen, disable checkpoints for the test.
+ if (!nnTopology.allHttpPortsSpecified() &&
+ nnTopology.isHA()) {
+ LOG.info("MiniDFSCluster disabling checkpointing in the Standby node " +
+ "since no HTTP ports have been specified.");
+ conf.setBoolean(DFS_HA_STANDBY_CHECKPOINTS_KEY, false);
}
+ if (!nnTopology.allIpcPortsSpecified() &&
+ nnTopology.isHA()) {
+ LOG.info("MiniDFSCluster disabling log-roll triggering in the "
+ + "Standby node since no IPC ports have been specified.");
+ conf.setInt(DFS_HA_LOGROLL_PERIOD_KEY, -1);
+ }
+
+ federation = nnTopology.isFederated();
+ createNameNodesAndSetConf(
+ nnTopology, manageNameDfsDirs, format, operation, clusterId, conf);
if (format) {
if (data_dir.exists() && !FileUtil.fullyDelete(data_dir)) {
@@ -542,51 +571,130 @@
ProxyUsers.refreshSuperUserGroupsConfiguration(conf);
}
- /** Initialize configuration for federated cluster */
- private static void initFederationConf(Configuration conf,
- Collection<String> nameserviceIds, int numDataNodes, int nnPort) {
- String nameserviceIdList = "";
- for (String nameserviceId : nameserviceIds) {
- // Create comma separated list of nameserviceIds
- if (nameserviceIdList.length() > 0) {
- nameserviceIdList += ",";
+ private void createNameNodesAndSetConf(MiniDFSNNTopology nnTopology,
+ boolean manageNameDfsDirs, boolean format, StartupOption operation,
+ String clusterId,
+ Configuration conf) throws IOException {
+ Preconditions.checkArgument(nnTopology.countNameNodes() > 0,
+ "empty NN topology: no namenodes specified!");
+
+ if (!federation && nnTopology.countNameNodes() == 1) {
+ NNConf onlyNN = nnTopology.getOnlyNameNode();
+ // we only had one NN, set DEFAULT_NAME for it
+ conf.set(FS_DEFAULT_NAME_KEY, "127.0.0.1:" + onlyNN.getIpcPort());
+ }
+
+ List<String> allNsIds = Lists.newArrayList();
+ for (MiniDFSNNTopology.NSConf nameservice : nnTopology.getNameservices()) {
+ if (nameservice.getId() != null) {
+ allNsIds.add(nameservice.getId());
}
- nameserviceIdList += nameserviceId;
- initFederatedNamenodeAddress(conf, nameserviceId, nnPort);
- nnPort = nnPort == 0 ? 0 : nnPort + 2;
}
- conf.set(DFS_FEDERATION_NAMESERVICES, nameserviceIdList);
- }
-
- /* For federated namenode initialize the address:port */
- private static void initFederatedNamenodeAddress(Configuration conf,
- String nameserviceId, int nnPort) {
- // Set nameserviceId specific key
- String key = DFSUtil.getNameServiceIdKey(
- DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId);
- conf.set(key, "127.0.0.1:0");
-
- key = DFSUtil.getNameServiceIdKey(
- DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId);
- conf.set(key, "127.0.0.1:" + nnPort);
- }
-
- private void createFederationNamenodes(Configuration conf,
- Collection<String> nameserviceIds, boolean manageNameDfsDirs,
- boolean format, StartupOption operation, String clusterId)
- throws IOException {
- // Create namenodes in the cluster
+ if (!allNsIds.isEmpty()) {
+ conf.set(DFS_FEDERATION_NAMESERVICES, Joiner.on(",").join(allNsIds));
+ }
+
int nnCounter = 0;
- for (String nameserviceId : nameserviceIds) {
- createFederatedNameNode(nnCounter++, conf, numDataNodes, manageNameDfsDirs,
- format, operation, clusterId, nameserviceId);
+ for (MiniDFSNNTopology.NSConf nameservice : nnTopology.getNameservices()) {
+ String nsId = nameservice.getId();
+
+ Preconditions.checkArgument(
+ !federation || nsId != null,
+ "if there is more than one NS, they must have names");
+
+ // First set up the configuration which all of the NNs
+ // need to have - have to do this a priori before starting
+ // *any* of the NNs, so they know to come up in standby.
+ List<String> nnIds = Lists.newArrayList();
+ // Iterate over the NNs in this nameservice
+ for (NNConf nn : nameservice.getNNs()) {
+ nnIds.add(nn.getNnId());
+
+ initNameNodeAddress(conf, nameservice.getId(), nn);
+ }
+
+ // If HA is enabled on this nameservice, enumerate all the namenodes
+ // in the configuration. Also need to set a shared edits dir
+ if (nnIds.size() > 1) {
+ conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, nameservice.getId()),
+ Joiner.on(",").join(nnIds));
+ if (manageNameDfsDirs) {
+ URI sharedEditsUri = getSharedEditsDir(nnCounter, nnCounter+nnIds.size()-1);
+ conf.set(DFS_NAMENODE_SHARED_EDITS_DIR_KEY, sharedEditsUri.toString());
+ }
+ }
+
+ // Now format first NN and copy the storage directory from that node to the others.
+ int i = 0;
+ Collection<URI> prevNNDirs = null;
+ int nnCounterForFormat = nnCounter;
+ for (NNConf nn : nameservice.getNNs()) {
+ initNameNodeConf(conf, nsId, nn.getNnId(), manageNameDfsDirs,
+ nnCounterForFormat);
+ Collection<URI> namespaceDirs = FSNamesystem.getNamespaceDirs(conf);
+ if (format) {
+ for (URI nameDirUri : namespaceDirs) {
+ File nameDir = new File(nameDirUri);
+ if (nameDir.exists() && !FileUtil.fullyDelete(nameDir)) {
+ throw new IOException("Could not fully delete " + nameDir);
+ }
+ }
+ }
+
+ boolean formatThisOne = format;
+ if (format && i++ > 0) {
+ // Don't format the second NN in an HA setup - that
+ // would result in it having a different clusterID,
+ // block pool ID, etc. Instead, copy the name dirs
+ // from the first one.
+ formatThisOne = false;
+ assert (null != prevNNDirs);
+ copyNameDirs(prevNNDirs, namespaceDirs, conf);
+ }
+
+ nnCounterForFormat++;
+ if (formatThisOne) {
+ DFSTestUtil.formatNameNode(conf);
+ }
+ prevNNDirs = namespaceDirs;
+ }
+
+ // Start all Namenodes
+ for (NNConf nn : nameservice.getNNs()) {
+ initNameNodeConf(conf, nsId, nn.getNnId(), manageNameDfsDirs, nnCounter);
+ createNameNode(nnCounter++, conf, numDataNodes, false, operation,
+ clusterId, nsId, nn.getNnId());
+ }
+
}
+
}
- private NameNode createNameNode(int nnIndex, Configuration conf,
- int numDataNodes, boolean manageNameDfsDirs, boolean format,
- StartupOption operation, String clusterId)
+ public URI getSharedEditsDir(int minNN, int maxNN) throws IOException {
+ return formatSharedEditsDir(base_dir, minNN, maxNN);
+ }
+
+ public static URI formatSharedEditsDir(File baseDir, int minNN, int maxNN)
throws IOException {
+ return fileAsURI(new File(baseDir, "shared-edits-" +
+ minNN + "-through-" + maxNN));
+ }
+
+ public NameNodeInfo[] getNameNodeInfos() {
+ return this.nameNodes;
+ }
+
+ private void initNameNodeConf(Configuration conf,
+ String nameserviceId, String nnId,
+ boolean manageNameDfsDirs, int nnIndex)
+ throws IOException {
+ if (nameserviceId != null) {
+ conf.set(DFS_FEDERATION_NAMESERVICE_ID, nameserviceId);
+ }
+ if (nnId != null) {
+ conf.set(DFS_HA_NAMENODE_ID_KEY, nnId);
+ }
+
if (manageNameDfsDirs) {
conf.set(DFS_NAMENODE_NAME_DIR_KEY,
fileAsURI(new File(base_dir, "name" + (2*nnIndex + 1)))+","+
@@ -595,7 +703,50 @@
fileAsURI(new File(base_dir, "namesecondary" + (2*nnIndex + 1)))+","+
fileAsURI(new File(base_dir, "namesecondary" + (2*nnIndex + 2))));
}
-
+ }
+
+ private void copyNameDirs(Collection<URI> srcDirs, Collection<URI> dstDirs,
+ Configuration dstConf) throws IOException {
+ URI srcDir = Lists.newArrayList(srcDirs).get(0);
+ FileSystem dstFS = FileSystem.getLocal(dstConf).getRaw();
+ for (URI dstDir : dstDirs) {
+ Preconditions.checkArgument(!dstDir.equals(srcDir));
+ File dstDirF = new File(dstDir);
+ if (dstDirF.exists()) {
+ Files.deleteRecursively(dstDirF);
+ }
+ LOG.info("Copying namedir from primary node dir "
+ + srcDir + " to " + dstDir);
+ FileUtil.copy(
+ new File(srcDir),
+ dstFS, new Path(dstDir), false, dstConf);
+ }
+ }
+
+ /**
+ * Initialize the address and port for this NameNode. In the
+ * non-federated case, the nameservice and namenode ID may be
+ * null.
+ */
+ private static void initNameNodeAddress(Configuration conf,
+ String nameserviceId, NNConf nnConf) {
+    // Set NN-specific key
+ String key = DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId,
+ nnConf.getNnId());
+ conf.set(key, "127.0.0.1:" + nnConf.getHttpPort());
+
+ key = DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId,
+ nnConf.getNnId());
+ conf.set(key, "127.0.0.1:" + nnConf.getIpcPort());
+ }
+
+ private void createNameNode(int nnIndex, Configuration conf,
+ int numDataNodes, boolean format, StartupOption operation,
+ String clusterId, String nameserviceId,
+ String nnId)
+ throws IOException {
// Format and clean out DataNode directories
if (format) {
DFSTestUtil.formatNameNode(conf);
@@ -609,25 +760,20 @@
operation == StartupOption.FORMAT ||
operation == StartupOption.REGULAR) ?
new String[] {} : new String[] {operation.getName()};
- return NameNode.createNameNode(args, conf);
- }
-
- private void createFederatedNameNode(int nnIndex, Configuration conf,
- int numDataNodes, boolean manageNameDfsDirs, boolean format,
- StartupOption operation, String clusterId, String nameserviceId)
- throws IOException {
- conf.set(DFS_FEDERATION_NAMESERVICE_ID, nameserviceId);
- NameNode nn = createNameNode(nnIndex, conf, numDataNodes, manageNameDfsDirs,
- format, operation, clusterId);
- conf.set(DFSUtil.getNameServiceIdKey(
- DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId), NetUtils
+ NameNode nn = NameNode.createNameNode(args, conf);
+
+ // After the NN has started, set back the bound ports into
+ // the conf
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, nameserviceId, nnId), NetUtils
.getHostPortString(nn.getNameNodeAddress()));
- conf.set(DFSUtil.getNameServiceIdKey(
- DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId), NetUtils
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_HTTP_ADDRESS_KEY, nameserviceId, nnId), NetUtils
.getHostPortString(nn.getHttpAddress()));
- DFSUtil.setGenericConf(conf, nameserviceId,
+ DFSUtil.setGenericConf(conf, nameserviceId, nnId,
DFS_NAMENODE_HTTP_ADDRESS_KEY);
- nameNodes[nnIndex] = new NameNodeInfo(nn, new Configuration(conf));
+ nameNodes[nnIndex] = new NameNodeInfo(nn, nameserviceId, nnId,
+ new Configuration(conf));
}
/**
@@ -652,6 +798,10 @@
}
return uri;
}
+
+ public int getInstanceId() {
+ return instanceId;
+ }
/**
* @return Configuration of for the given namenode
@@ -1005,7 +1155,14 @@
*/
public NamenodeProtocols getNameNodeRpc() {
checkSingleNameNode();
- return getNameNode(0).getRpcServer();
+ return getNameNodeRpc(0);
+ }
+
+ /**
+ * Get an instance of the NameNode's RPC handler.
+ */
+ public NamenodeProtocols getNameNodeRpc(int nnIndex) {
+ return getNameNode(nnIndex).getRpcServer();
}
/**
@@ -1075,6 +1232,7 @@
LOG.info("Shutting down the Mini HDFS Cluster");
shutdownDataNodes();
for (NameNodeInfo nnInfo : nameNodes) {
+ if (nnInfo == null) continue;
NameNode nameNode = nnInfo.nameNode;
if (nameNode != null) {
nameNode.stop();
@@ -1116,7 +1274,16 @@
nn.stop();
nn.join();
Configuration conf = nameNodes[nnIndex].conf;
- nameNodes[nnIndex] = new NameNodeInfo(null, conf);
+ nameNodes[nnIndex] = new NameNodeInfo(null, null, null, conf);
+ }
+ }
+
+ /**
+ * Restart all namenodes.
+ */
+ public synchronized void restartNameNodes() throws IOException {
+ for (int i = 0; i < nameNodes.length; i++) {
+ restartNameNode(i);
}
}
@@ -1150,10 +1317,12 @@
*/
public synchronized void restartNameNode(int nnIndex, boolean waitActive)
throws IOException {
+ String nameserviceId = nameNodes[nnIndex].nameserviceId;
+ String nnId = nameNodes[nnIndex].nnId;
Configuration conf = nameNodes[nnIndex].conf;
shutdownNameNode(nnIndex);
NameNode nn = NameNode.createNameNode(new String[] {}, conf);
- nameNodes[nnIndex] = new NameNodeInfo(nn, conf);
+ nameNodes[nnIndex] = new NameNodeInfo(nn, nameserviceId, nnId, conf);
if (waitActive) {
waitClusterUp();
LOG.info("Restarted the namenode");
@@ -1345,17 +1514,11 @@
return false;
}
long[] sizes;
- try {
- sizes = nameNode.getRpcServer().getStats();
- } catch (IOException ioe) {
- // This method above should never throw.
- // It only throws IOE since it is exposed via RPC
- throw (AssertionError)(new AssertionError("Unexpected IOE thrown: "
- + StringUtils.stringifyException(ioe)).initCause(ioe));
- }
+ sizes = NameNodeAdapter.getStats(nameNode.getNamesystem());
boolean isUp = false;
synchronized (this) {
- isUp = ((!nameNode.isInSafeMode() || !waitSafeMode) && sizes[0] != 0);
+ isUp = ((!nameNode.isInSafeMode() || !waitSafeMode) &&
+ sizes[ClientProtocol.GET_STATS_CAPACITY_IDX] != 0);
}
return isUp;
}
@@ -1459,9 +1622,48 @@
/**
* Get the directories where the namenode stores its edits.
*/
- public Collection<URI> getNameEditsDirs(int nnIndex) {
+ public Collection<URI> getNameEditsDirs(int nnIndex) throws IOException {
return FSNamesystem.getNamespaceEditsDirs(nameNodes[nnIndex].conf);
}
+
+ private HAServiceProtocol getHaServiceClient(int nnIndex) throws IOException {
+ InetSocketAddress addr = nameNodes[nnIndex].nameNode.getServiceRpcAddress();
+ return new HAServiceProtocolClientSideTranslatorPB(addr, conf);
+ }
+
+ public void transitionToActive(int nnIndex) throws IOException,
+ ServiceFailedException {
+ HAServiceProtocolHelper.transitionToActive(getHaServiceClient(nnIndex));
+ }
+
+ public void transitionToStandby(int nnIndex) throws IOException,
+ ServiceFailedException {
+ HAServiceProtocolHelper.transitionToStandby(getHaServiceClient(nnIndex));
+ }
+
+
+ public void triggerBlockReports()
+ throws IOException {
+ for (DataNode dn : getDataNodes()) {
+ DataNodeAdapter.triggerBlockReport(dn);
+ }
+ }
+
+
+ public void triggerDeletionReports()
+ throws IOException {
+ for (DataNode dn : getDataNodes()) {
+ DataNodeAdapter.triggerDeletionReport(dn);
+ }
+ }
+
+ public void triggerHeartbeats()
+ throws IOException {
+ for (DataNode dn : getDataNodes()) {
+ DataNodeAdapter.triggerHeartbeat(dn);
+ }
+ }
+
/** Wait until the given namenode gets registration from all the datanodes */
public void waitActive(int nnIndex) throws IOException {
@@ -1469,6 +1671,7 @@
return;
}
InetSocketAddress addr = nameNodes[nnIndex].nameNode.getServiceRpcAddress();
+ assert addr.getPort() != 0;
DFSClient client = new DFSClient(addr, conf);
// ensure all datanodes have registered and sent heartbeat to the namenode
@@ -1512,9 +1715,9 @@
// If a datanode failed to start, then do not wait
for (DataNodeProperties dn : dataNodes) {
// the datanode thread communicating with the namenode should be alive
- if (!dn.datanode.isBPServiceAlive(addr)) {
- LOG.warn("BPOfferService failed to start in datanode " + dn.datanode
- + " for namenode at " + addr);
+ if (!dn.datanode.isConnectedToNN(addr)) {
+ LOG.warn("BPOfferService in datanode " + dn.datanode
+ + " failed to connect to namenode at " + addr);
return false;
}
}
@@ -1653,6 +1856,10 @@
public void setLeasePeriod(long soft, long hard) {
NameNodeAdapter.setLeasePeriod(getNamesystem(), soft, hard);
}
+
+ public void setWaitSafeMode(boolean wait) {
+ this.waitSafeMode = wait;
+ }
/**
* Returns the current set of datanodes
@@ -1869,7 +2076,7 @@
throws IOException {
if(!federation)
throw new IOException("cannot add namenode to non-federated cluster");
-
+
int nnIndex = nameNodes.length;
int numNameNodes = nameNodes.length + 1;
NameNodeInfo[] newlist = new NameNodeInfo[numNameNodes];
@@ -1880,10 +2087,13 @@
String nameserviceIds = conf.get(DFS_FEDERATION_NAMESERVICES);
nameserviceIds += "," + nameserviceId;
conf.set(DFS_FEDERATION_NAMESERVICES, nameserviceIds);
-
- initFederatedNamenodeAddress(conf, nameserviceId, namenodePort);
- createFederatedNameNode(nnIndex, conf, numDataNodes, true, true, null,
- null, nameserviceId);
+
+ String nnId = null;
+ initNameNodeAddress(conf, nameserviceId,
+ new NNConf(nnId).setIpcPort(namenodePort));
+ initNameNodeConf(conf, nameserviceId, nnId, true, nnIndex);
+ createNameNode(nnIndex, conf, numDataNodes, true, null, null,
+ nameserviceId, nnId);
// Refresh datanodes with the newly started namenode
for (DataNodeProperties dn : dataNodes) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSNNTopology.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSNNTopology.java
new file mode 100644
index 0000000..4dfbfd8
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSNNTopology.java
@@ -0,0 +1,227 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import java.util.List;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
+/**
+ * This class is used to specify the setup of namenodes when instantiating
+ * a MiniDFSCluster. It consists of a set of nameservices, each of which
+ * may have one or more namenodes (in the case of HA)
+ */
+@InterfaceAudience.LimitedPrivate({"HBase", "HDFS", "Hive", "MapReduce", "Pig"})
+@InterfaceStability.Unstable
+public class MiniDFSNNTopology {
+ private final List<NSConf> nameservices = Lists.newArrayList();
+ private boolean federation;
+
+ public MiniDFSNNTopology() {
+ }
+
+ /**
+ * Set up a simple non-federated non-HA NN.
+ */
+ public static MiniDFSNNTopology simpleSingleNN(
+ int nameNodePort, int nameNodeHttpPort) {
+ return new MiniDFSNNTopology()
+ .addNameservice(new MiniDFSNNTopology.NSConf(null)
+ .addNN(new MiniDFSNNTopology.NNConf(null)
+ .setHttpPort(nameNodeHttpPort)
+ .setIpcPort(nameNodePort)));
+ }
+
+
+ /**
+ * Set up an HA topology with a single HA nameservice.
+ */
+ public static MiniDFSNNTopology simpleHATopology() {
+ return new MiniDFSNNTopology()
+ .addNameservice(new MiniDFSNNTopology.NSConf("minidfs-ns")
+ .addNN(new MiniDFSNNTopology.NNConf("nn1"))
+ .addNN(new MiniDFSNNTopology.NNConf("nn2")));
+ }
+
+ /**
+   * Set up a federated cluster with the given number of nameservices, each
+ * of which has only a single NameNode.
+ */
+ public static MiniDFSNNTopology simpleFederatedTopology(
+ int numNameservices) {
+ MiniDFSNNTopology topology = new MiniDFSNNTopology();
+ for (int i = 1; i <= numNameservices; i++) {
+ topology.addNameservice(new MiniDFSNNTopology.NSConf("ns" + i)
+ .addNN(new MiniDFSNNTopology.NNConf(null)));
+ }
+ topology.setFederation(true);
+ return topology;
+ }
+
+ /**
+   * Set up a federated cluster with the given number of nameservices, each
+ * of which has two NameNodes.
+ */
+ public static MiniDFSNNTopology simpleHAFederatedTopology(
+ int numNameservices) {
+ MiniDFSNNTopology topology = new MiniDFSNNTopology();
+ for (int i = 0; i < numNameservices; i++) {
+ topology.addNameservice(new MiniDFSNNTopology.NSConf("ns" + i)
+ .addNN(new MiniDFSNNTopology.NNConf("nn0"))
+ .addNN(new MiniDFSNNTopology.NNConf("nn1")));
+ }
+ topology.setFederation(true);
+ return topology;
+ }
+
+ public MiniDFSNNTopology setFederation(boolean federation) {
+ this.federation = federation;
+ return this;
+ }
+
+ public MiniDFSNNTopology addNameservice(NSConf nameservice) {
+ Preconditions.checkArgument(!nameservice.getNNs().isEmpty(),
+ "Must have at least one NN in a nameservice");
+ this.nameservices.add(nameservice);
+ return this;
+ }
+
+ public int countNameNodes() {
+ int count = 0;
+ for (NSConf ns : nameservices) {
+ count += ns.nns.size();
+ }
+ return count;
+ }
+
+ public NNConf getOnlyNameNode() {
+ Preconditions.checkState(countNameNodes() == 1,
+ "must have exactly one NN!");
+ return nameservices.get(0).getNNs().get(0);
+ }
+
+ public boolean isFederated() {
+ return nameservices.size() > 1 || federation;
+ }
+
+ /**
+ * @return true if at least one of the nameservices
+ * in the topology has HA enabled.
+ */
+ public boolean isHA() {
+ for (NSConf ns : nameservices) {
+ if (ns.getNNs().size() > 1) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * @return true if all of the NNs in the cluster have their HTTP
+ * port specified to be non-ephemeral.
+ */
+ public boolean allHttpPortsSpecified() {
+ for (NSConf ns : nameservices) {
+ for (NNConf nn : ns.getNNs()) {
+ if (nn.getHttpPort() == 0) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ /**
+ * @return true if all of the NNs in the cluster have their IPC
+ * port specified to be non-ephemeral.
+ */
+ public boolean allIpcPortsSpecified() {
+ for (NSConf ns : nameservices) {
+ for (NNConf nn : ns.getNNs()) {
+ if (nn.getIpcPort() == 0) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ public List<NSConf> getNameservices() {
+ return nameservices;
+ }
+
+ public static class NSConf {
+ private final String id;
+ private final List<NNConf> nns = Lists.newArrayList();
+
+ public NSConf(String id) {
+ this.id = id;
+ }
+
+ public NSConf addNN(NNConf nn) {
+ this.nns.add(nn);
+ return this;
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public List<NNConf> getNNs() {
+ return nns;
+ }
+ }
+
+ public static class NNConf {
+ private String nnId;
+ private int httpPort;
+ private int ipcPort;
+
+ public NNConf(String nnId) {
+ this.nnId = nnId;
+ }
+
+ String getNnId() {
+ return nnId;
+ }
+
+ int getIpcPort() {
+ return ipcPort;
+ }
+
+ int getHttpPort() {
+ return httpPort;
+ }
+
+ public NNConf setHttpPort(int httpPort) {
+ this.httpPort = httpPort;
+ return this;
+ }
+
+ public NNConf setIpcPort(int ipcPort) {
+ this.ipcPort = ipcPort;
+ return this;
+ }
+ }
+
+}
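Beyond the plain HA and plain federated helpers, the simpleHAFederatedTopology factory above can be combined with the new MiniDFSCluster builder and HA transition methods from earlier in this patch. A brief sketch (test-style code, class and method names hypothetical) of bringing up such a cluster and activating one namenode per nameservice:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.MiniDFSCluster;
    import org.apache.hadoop.hdfs.MiniDFSNNTopology;

    class HaFederatedClusterSketch {
      static MiniDFSCluster start() throws Exception {
        Configuration conf = new Configuration();
        // Two nameservices, each with two namenodes (four NNs total).
        MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
            .nnTopology(MiniDFSNNTopology.simpleHAFederatedTopology(2))
            .numDataNodes(1)
            .build();
        // Namenodes start in standby; make the first NN of each nameservice active.
        cluster.transitionToActive(0);  // nameservice ns0, namenode nn0
        cluster.transitionToActive(2);  // nameservice ns1, namenode nn0
        cluster.waitActive();
        return cluster;
      }
    }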
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientFailover.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientFailover.java
new file mode 100644
index 0000000..a88e8a7
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSClientFailover.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider;
+import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.util.StringUtils;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestDFSClientFailover {
+
+ private static final Log LOG = LogFactory.getLog(TestDFSClientFailover.class);
+
+ private static final Path TEST_FILE = new Path("/tmp/failover-test-file");
+ private static final int FILE_LENGTH_TO_VERIFY = 100;
+
+ private Configuration conf = new Configuration();
+ private MiniDFSCluster cluster;
+
+ @Before
+ public void setUpCluster() throws IOException {
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .build();
+ cluster.transitionToActive(0);
+ cluster.waitActive();
+ }
+
+ @After
+ public void tearDownCluster() throws IOException {
+ cluster.shutdown();
+ }
+
+ /**
+ * Make sure that client failover works when an active NN dies and the standby
+ * takes over.
+ */
+ @Test
+ public void testDfsClientFailover() throws IOException, URISyntaxException {
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+
+ DFSTestUtil.createFile(fs, TEST_FILE,
+ FILE_LENGTH_TO_VERIFY, (short)1, 1L);
+
+ assertEquals(fs.getFileStatus(TEST_FILE).getLen(), FILE_LENGTH_TO_VERIFY);
+ cluster.shutdownNameNode(0);
+ cluster.transitionToActive(1);
+ assertEquals(fs.getFileStatus(TEST_FILE).getLen(), FILE_LENGTH_TO_VERIFY);
+
+ // Check that it functions even if the URL becomes canonicalized
+ // to include a port number.
+ Path withPort = new Path("hdfs://" +
+ HATestUtil.getLogicalHostname(cluster) + ":" +
+ NameNode.DEFAULT_PORT + "/" + TEST_FILE.toUri().getPath());
+ FileSystem fs2 = withPort.getFileSystem(fs.getConf());
+ assertTrue(fs2.exists(withPort));
+
+ fs.close();
+ }
+
+ /**
+ * Regression test for HDFS-2683.
+ */
+ @Test
+ public void testLogicalUriShouldNotHavePorts() {
+ Configuration conf = new HdfsConfiguration();
+ conf.set(DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + ".foo",
+ ConfiguredFailoverProxyProvider.class.getName());
+ Path p = new Path("hdfs://foo:12345/");
+ try {
+ p.getFileSystem(conf).exists(p);
+ fail("Did not fail with fake FS");
+ } catch (IOException ioe) {
+ GenericTestUtils.assertExceptionContains(
+ "does not use port information", ioe);
+ }
+ }
+
+ /**
+ * Make sure that a helpful error message is shown if a proxy provider is
+ * configured for a given URI, but no actual addresses are configured for that
+ * URI.
+ */
+ @Test
+ public void testFailureWithMisconfiguredHaNNs() throws Exception {
+ String logicalHost = "misconfigured-ha-uri";
+ Configuration conf = new Configuration();
+ conf.set(DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "." + logicalHost,
+ ConfiguredFailoverProxyProvider.class.getName());
+
+ URI uri = new URI("hdfs://" + logicalHost + "/test");
+ try {
+ FileSystem.get(uri, conf).exists(new Path("/test"));
+ fail("Successfully got proxy provider for misconfigured FS");
+ } catch (IOException ioe) {
+ LOG.info("got expected exception", ioe);
+ assertTrue("expected exception did not contain helpful message",
+ StringUtils.stringifyException(ioe).contains(
+ "Could not find any configured addresses for URI " + uri));
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java
index a308c23..ad3e6d8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java
@@ -34,6 +34,7 @@
import static org.apache.hadoop.hdfs.server.namenode.NNStorage.getInProgressEditsFileName;
import static org.apache.hadoop.hdfs.server.namenode.NNStorage.getImageFileName;
+import static org.apache.hadoop.test.GenericTestUtils.assertExists;
import org.apache.hadoop.util.StringUtils;
import org.junit.BeforeClass;
import org.junit.Ignore;
@@ -51,7 +52,7 @@
*/
public class TestDFSUpgrade {
- private static final int EXPECTED_TXID = 17;
+ private static final int EXPECTED_TXID = 49;
private static final Log LOG = LogFactory.getLog(TestDFSUpgrade.class.getName());
private Configuration conf;
private int testCounter = 0;
@@ -80,16 +81,16 @@
Joiner.on(" \n").join(new File(baseDir, "current").list()));
LOG.info("==================");
- assertTrue(new File(baseDir,"current").isDirectory());
- assertTrue(new File(baseDir,"current/VERSION").isFile());
- assertTrue(new File(baseDir,"current/"
- + getInProgressEditsFileName(imageTxId + 1)).isFile());
- assertTrue(new File(baseDir,"current/"
- + getImageFileName(imageTxId)).isFile());
- assertTrue(new File(baseDir,"current/seen_txid").isFile());
+ assertExists(new File(baseDir,"current"));
+ assertExists(new File(baseDir,"current/VERSION"));
+ assertExists(new File(baseDir,"current/"
+ + getInProgressEditsFileName(imageTxId + 1)));
+ assertExists(new File(baseDir,"current/"
+ + getImageFileName(imageTxId)));
+ assertExists(new File(baseDir,"current/seen_txid"));
File previous = new File(baseDir, "previous");
- assertTrue(previous.isDirectory());
+ assertExists(previous);
assertEquals(UpgradeUtilities.checksumContents(NAME_NODE, previous),
UpgradeUtilities.checksumMasterNameNodeContents());
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java
index 3b93aeb..ef8f850 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUtil.java
@@ -18,16 +18,20 @@
package org.apache.hadoop.hdfs;
+import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.*;
import java.io.IOException;
import java.net.InetSocketAddress;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
@@ -39,9 +43,20 @@
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
public class TestDFSUtil {
+
+ /**
+ * Reset to default UGI settings since some tests change them.
+ */
+ @Before
+ public void resetUGI() {
+ UserGroupInformation.setConfiguration(new Configuration());
+ }
+
/**
* Test conversion of LocatedBlock to BlockLocation
*/
@@ -86,7 +101,7 @@
private Configuration setupAddress(String key) {
HdfsConfiguration conf = new HdfsConfiguration();
conf.set(DFS_FEDERATION_NAMESERVICES, "nn1");
- conf.set(DFSUtil.getNameServiceIdKey(key, "nn1"), "localhost:9000");
+ conf.set(DFSUtil.addKeySuffixes(key, "nn1"), "localhost:9000");
return conf;
}
@@ -102,7 +117,7 @@
}
/**
- * Test {@link DFSUtil#getNameNodeNameServiceId(Configuration)} to ensure
+ * Test {@link DFSUtil#getNamenodeNameServiceId(Configuration)} to ensure
* nameserviceId for namenode is determined based on matching the address with
* local node's address
*/
@@ -135,7 +150,7 @@
}
/**
- * Test {@link DFSUtil#getNameServiceId(Configuration, String))} to ensure
+ * Test {@link DFSUtil#getNamenodeNameServiceId(Configuration)} to ensure
* exception is thrown when multiple rpc addresses match the local node's
* address
*/
@@ -143,9 +158,9 @@
public void testGetNameServiceIdException() {
HdfsConfiguration conf = new HdfsConfiguration();
conf.set(DFS_FEDERATION_NAMESERVICES, "nn1,nn2");
- conf.set(DFSUtil.getNameServiceIdKey(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn1"),
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn1"),
"localhost:9000");
- conf.set(DFSUtil.getNameServiceIdKey(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn2"),
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn2"),
"localhost:9001");
DFSUtil.getNamenodeNameServiceId(conf);
fail("Expected exception is not thrown");
@@ -178,19 +193,24 @@
final String NN1_ADDRESS = "localhost:9000";
final String NN2_ADDRESS = "localhost:9001";
final String NN3_ADDRESS = "localhost:9002";
- conf.set(DFSUtil.getNameServiceIdKey(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn1"),
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn1"),
NN1_ADDRESS);
- conf.set(DFSUtil.getNameServiceIdKey(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn2"),
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY, "nn2"),
NN2_ADDRESS);
- Collection<InetSocketAddress> nnAddresses = DFSUtil
+ Map<String, Map<String, InetSocketAddress>> nnMap = DFSUtil
.getNNServiceRpcAddresses(conf);
- assertEquals(2, nnAddresses.size());
- Iterator<InetSocketAddress> iterator = nnAddresses.iterator();
- InetSocketAddress addr = iterator.next();
+ assertEquals(2, nnMap.size());
+
+ Map<String, InetSocketAddress> nn1Map = nnMap.get("nn1");
+ assertEquals(1, nn1Map.size());
+ InetSocketAddress addr = nn1Map.get(null);
assertEquals("localhost", addr.getHostName());
assertEquals(9000, addr.getPort());
- addr = iterator.next();
+
+ Map<String, InetSocketAddress> nn2Map = nnMap.get("nn2");
+ assertEquals(1, nn2Map.size());
+ addr = nn2Map.get(null);
assertEquals("localhost", addr.getHostName());
assertEquals(9001, addr.getPort());
@@ -198,6 +218,10 @@
checkNameServiceId(conf, NN1_ADDRESS, "nn1");
checkNameServiceId(conf, NN2_ADDRESS, "nn2");
checkNameServiceId(conf, NN3_ADDRESS, null);
+
+ // HA is not enabled in a purely federated config
+ assertFalse(HAUtil.isHAEnabled(conf, "nn1"));
+ assertFalse(HAUtil.isHAEnabled(conf, "nn2"));
}
public void checkNameServiceId(Configuration conf, String addr,
@@ -216,9 +240,14 @@
conf.set(FS_DEFAULT_NAME_KEY, hdfs_default);
// If DFS_FEDERATION_NAMESERVICES is not set, verify that
// default namenode address is returned.
- List<InetSocketAddress> addrList = DFSUtil.getNNServiceRpcAddresses(conf);
- assertEquals(1, addrList.size());
- assertEquals(9999, addrList.get(0).getPort());
+ Map<String, Map<String, InetSocketAddress>> addrMap =
+ DFSUtil.getNNServiceRpcAddresses(conf);
+ assertEquals(1, addrMap.size());
+
+ Map<String, InetSocketAddress> defaultNsMap = addrMap.get(null);
+ assertEquals(1, defaultNsMap.size());
+
+ assertEquals(9999, defaultNsMap.get(null).getPort());
}
/**
@@ -226,20 +255,51 @@
* copied to generic keys when the namenode starts.
*/
@Test
- public void testConfModification() throws IOException {
+ public void testConfModificationFederationOnly() {
final HdfsConfiguration conf = new HdfsConfiguration();
- conf.set(DFS_FEDERATION_NAMESERVICES, "nn1");
- conf.set(DFS_FEDERATION_NAMESERVICE_ID, "nn1");
- final String nameserviceId = DFSUtil.getNamenodeNameServiceId(conf);
+ String nsId = "ns1";
+
+ conf.set(DFS_FEDERATION_NAMESERVICES, nsId);
+ conf.set(DFS_FEDERATION_NAMESERVICE_ID, nsId);
// Set the nameservice specific keys with nameserviceId in the config key
for (String key : NameNode.NAMESERVICE_SPECIFIC_KEYS) {
// Note: value is same as the key
- conf.set(DFSUtil.getNameServiceIdKey(key, nameserviceId), key);
+ conf.set(DFSUtil.addKeySuffixes(key, nsId), key);
}
// Initialize generic keys from specific keys
- NameNode.initializeGenericKeys(conf, nameserviceId);
+ NameNode.initializeGenericKeys(conf, nsId, null);
+
+ // Retrieve the keys without nameserviceId and ensure generic keys are set
+ // to the correct value
+ for (String key : NameNode.NAMESERVICE_SPECIFIC_KEYS) {
+ assertEquals(key, conf.get(key));
+ }
+ }
+
+ /**
+ * Test to ensure nameservice specific keys in the configuration are
+ * copied to generic keys when the namenode starts.
+ */
+ @Test
+ public void testConfModificationFederationAndHa() {
+ final HdfsConfiguration conf = new HdfsConfiguration();
+ String nsId = "ns1";
+ String nnId = "nn1";
+
+ conf.set(DFS_FEDERATION_NAMESERVICES, nsId);
+ conf.set(DFS_FEDERATION_NAMESERVICE_ID, nsId);
+ conf.set(DFS_HA_NAMENODES_KEY_PREFIX + "." + nsId, nnId);
+
+ // Set the nameservice specific keys with nameserviceId in the config key
+ for (String key : NameNode.NAMESERVICE_SPECIFIC_KEYS) {
+ // Note: value is same as the key
+ conf.set(DFSUtil.addKeySuffixes(key, nsId, nnId), key);
+ }
+
+ // Initialize generic keys from specific keys
+ NameNode.initializeGenericKeys(conf, nsId, nnId);
+ // Retrieve the keys without nameserviceId and ensure generic keys are set
// to the correct value
@@ -249,6 +309,39 @@
}
/**
+ * Regression test for HDFS-2934.
+ */
+ @Test
+ public void testSomeConfsNNSpecificSomeNSSpecific() {
+ final HdfsConfiguration conf = new HdfsConfiguration();
+
+ String key = DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY;
+ conf.set(key, "global-default");
+ conf.set(key + ".ns1", "ns1-override");
+ conf.set(key + ".ns1.nn1", "nn1-override");
+
+ // A namenode in another nameservice should get the global default.
+ Configuration newConf = new Configuration(conf);
+ NameNode.initializeGenericKeys(newConf, "ns2", "nn1");
+ assertEquals("global-default", newConf.get(key));
+
+ // A namenode in another non-HA nameservice should get global default.
+ newConf = new Configuration(conf);
+ NameNode.initializeGenericKeys(newConf, "ns2", null);
+ assertEquals("global-default", newConf.get(key));
+
+ // A namenode in the same nameservice should get the ns setting
+ newConf = new Configuration(conf);
+ NameNode.initializeGenericKeys(newConf, "ns1", "nn2");
+ assertEquals("ns1-override", newConf.get(key));
+
+ // The nn with the nn-specific setting should get its own override
+ newConf = new Configuration(conf);
+ NameNode.initializeGenericKeys(newConf, "ns1", "nn1");
+ assertEquals("nn1-override", newConf.get(key));
+ }
+
+ /**
* Tests for empty configuration, an exception is thrown from
* {@link DFSUtil#getNNServiceRpcAddresses(Configuration)}
* {@link DFSUtil#getBackupNodeAddresses(Configuration)}
@@ -258,21 +351,30 @@
public void testEmptyConf() {
HdfsConfiguration conf = new HdfsConfiguration(false);
try {
- DFSUtil.getNNServiceRpcAddresses(conf);
- fail("Expected IOException is not thrown");
+ Map<String, Map<String, InetSocketAddress>> map =
+ DFSUtil.getNNServiceRpcAddresses(conf);
+ fail("Expected IOException is not thrown, result was: " +
+ DFSUtil.addressMapToString(map));
} catch (IOException expected) {
+ /** Expected */
}
try {
- DFSUtil.getBackupNodeAddresses(conf);
- fail("Expected IOException is not thrown");
+ Map<String, Map<String, InetSocketAddress>> map =
+ DFSUtil.getBackupNodeAddresses(conf);
+ fail("Expected IOException is not thrown, result was: " +
+ DFSUtil.addressMapToString(map));
} catch (IOException expected) {
+ /** Expected */
}
try {
- DFSUtil.getSecondaryNameNodeAddresses(conf);
- fail("Expected IOException is not thrown");
+ Map<String, Map<String, InetSocketAddress>> map =
+ DFSUtil.getSecondaryNameNodeAddresses(conf);
+ fail("Expected IOException is not thrown, result was: " +
+ DFSUtil.addressMapToString(map));
} catch (IOException expected) {
+ /** Expected */
}
}
@@ -286,5 +388,144 @@
String httpport = DFSUtil.getInfoServer(null, conf, false);
assertEquals("0.0.0.0:50070", httpport);
}
+
+ @Test
+ public void testHANameNodesWithFederation() throws URISyntaxException {
+ HdfsConfiguration conf = new HdfsConfiguration();
+
+ final String NS1_NN1_HOST = "ns1-nn1.example.com:8020";
+ final String NS1_NN2_HOST = "ns1-nn2.example.com:8020";
+ final String NS2_NN1_HOST = "ns2-nn1.example.com:8020";
+ final String NS2_NN2_HOST = "ns2-nn2.example.com:8020";
+ conf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, "hdfs://ns1");
+
+ // Two nameservices, each with two NNs.
+ conf.set(DFS_FEDERATION_NAMESERVICES, "ns1,ns2");
+ conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, "ns1"),
+ "ns1-nn1,ns1-nn2");
+ conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, "ns2"),
+ "ns2-nn1,ns2-nn2");
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "ns1-nn1"),
+ NS1_NN1_HOST);
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "ns1-nn2"),
+ NS1_NN2_HOST);
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, "ns2", "ns2-nn1"),
+ NS2_NN1_HOST);
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, "ns2", "ns2-nn2"),
+ NS2_NN2_HOST);
+
+ Map<String, Map<String, InetSocketAddress>> map =
+ DFSUtil.getHaNnRpcAddresses(conf);
-}
\ No newline at end of file
+ assertTrue(HAUtil.isHAEnabled(conf, "ns1"));
+ assertTrue(HAUtil.isHAEnabled(conf, "ns2"));
+ assertFalse(HAUtil.isHAEnabled(conf, "ns3"));
+
+ assertEquals(NS1_NN1_HOST, map.get("ns1").get("ns1-nn1").toString());
+ assertEquals(NS1_NN2_HOST, map.get("ns1").get("ns1-nn2").toString());
+ assertEquals(NS2_NN1_HOST, map.get("ns2").get("ns2-nn1").toString());
+ assertEquals(NS2_NN2_HOST, map.get("ns2").get("ns2-nn2").toString());
+
+ assertEquals(NS1_NN1_HOST,
+ DFSUtil.getNamenodeServiceAddr(conf, "ns1", "ns1-nn1"));
+ assertEquals(NS1_NN2_HOST,
+ DFSUtil.getNamenodeServiceAddr(conf, "ns1", "ns1-nn2"));
+ assertEquals(NS2_NN1_HOST,
+ DFSUtil.getNamenodeServiceAddr(conf, "ns2", "ns2-nn1"));
+
+ // No nameservice was given and we can't determine which service addr
+ // to use as two nameservices could share a namenode ID.
+ assertEquals(null, DFSUtil.getNamenodeServiceAddr(conf, null, "ns1-nn1"));
+
+ // Ditto for nameservice IDs, if multiple are defined
+ assertEquals(null, DFSUtil.getNamenodeNameServiceId(conf));
+ assertEquals(null, DFSUtil.getSecondaryNameServiceId(conf));
+
+ Collection<URI> uris = DFSUtil.getNameServiceUris(conf, DFS_NAMENODE_RPC_ADDRESS_KEY);
+ assertEquals(2, uris.size());
+ assertTrue(uris.contains(new URI("hdfs://ns1")));
+ assertTrue(uris.contains(new URI("hdfs://ns2")));
+ }
+
+ @Test
+ public void getNameNodeServiceAddr() throws IOException {
+ HdfsConfiguration conf = new HdfsConfiguration();
+
+ // One nameservice with two NNs
+ final String NS1_NN1_HOST = "ns1-nn1.example.com:8020";
+ final String NS1_NN1_HOST_SVC = "ns1-nn1.example.com:8021";
+ final String NS1_NN2_HOST = "ns1-nn2.example.com:8020";
+ final String NS1_NN2_HOST_SVC = "ns1-nn2.example.com:8021";
+
+ conf.set(DFS_FEDERATION_NAMESERVICES, "ns1");
+ conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, "ns1"),"nn1,nn2");
+
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn1"), NS1_NN1_HOST);
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn2"), NS1_NN2_HOST);
+
+ // The rpc address is used if no service address is defined
+ assertEquals(NS1_NN1_HOST, DFSUtil.getNamenodeServiceAddr(conf, null, "nn1"));
+ assertEquals(NS1_NN2_HOST, DFSUtil.getNamenodeServiceAddr(conf, null, "nn2"));
+
+ // A nameservice is specified explicitly
+ assertEquals(NS1_NN1_HOST, DFSUtil.getNamenodeServiceAddr(conf, "ns1", "nn1"));
+ assertEquals(null, DFSUtil.getNamenodeServiceAddr(conf, "invalid", "nn1"));
+
+ // The service addrs are used when they are defined
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, "ns1", "nn1"), NS1_NN1_HOST_SVC);
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, "ns1", "nn2"), NS1_NN2_HOST_SVC);
+
+ assertEquals(NS1_NN1_HOST_SVC, DFSUtil.getNamenodeServiceAddr(conf, null, "nn1"));
+ assertEquals(NS1_NN2_HOST_SVC, DFSUtil.getNamenodeServiceAddr(conf, null, "nn2"));
+
+ // We can determine the nameservice ID since there's only one listed
+ assertEquals("ns1", DFSUtil.getNamenodeNameServiceId(conf));
+ assertEquals("ns1", DFSUtil.getSecondaryNameServiceId(conf));
+ }
+
+ @Test
+ public void testSubstituteForWildcardAddress() throws IOException {
+ assertEquals("foo:12345",
+ DFSUtil.substituteForWildcardAddress("0.0.0.0:12345", "foo"));
+ assertEquals("127.0.0.1:12345",
+ DFSUtil.substituteForWildcardAddress("127.0.0.1:12345", "foo"));
+ }
+
+ @Test
+ public void testGetNNUris() throws Exception {
+ HdfsConfiguration conf = new HdfsConfiguration();
+
+ final String NS1_NN1_HOST = "ns1-nn1.example.com:8020";
+ final String NS1_NN2_HOST = "ns1-nn2.example.com:8020";
+ final String NS2_NN_HOST = "ns2-nn.example.com:8020";
+ final String NN_HOST = "nn.example.com:8020";
+
+ conf.set(DFS_FEDERATION_NAMESERVICES, "ns1,ns2");
+ conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, "ns1"),"nn1,nn2");
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn1"), NS1_NN1_HOST);
+ conf.set(DFSUtil.addKeySuffixes(
+ DFS_NAMENODE_RPC_ADDRESS_KEY, "ns1", "nn2"), NS1_NN2_HOST);
+
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, "ns2"),
+ NS2_NN_HOST);
+
+ conf.set(DFS_NAMENODE_RPC_ADDRESS_KEY, "hdfs://" + NN_HOST);
+
+ Collection<URI> uris = DFSUtil.getNameServiceUris(conf, DFS_NAMENODE_RPC_ADDRESS_KEY,
+ DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY);
+
+ assertEquals(3, uris.size());
+ assertTrue(uris.contains(new URI("hdfs://ns1")));
+ assertTrue(uris.contains(new URI("hdfs://" + NS2_NN_HOST)));
+ assertTrue(uris.contains(new URI("hdfs://" + NN_HOST)));
+ }
+}
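
Editor's note: the new TestDFSUtil cases above revolve around two ideas — config keys gain ".<nameserviceId>" and ".<namenodeId>" suffixes (DFSUtil.addKeySuffixes), and lookups prefer the most specific suffix, as testSomeConfsNNSpecificSomeNSSpecific shows. Below is a minimal pure-Java sketch of that scheme using a plain HashMap instead of Hadoop's Configuration; the class and helper names are illustrative only.

import java.util.HashMap;
import java.util.Map;

/** Illustrative sketch of the suffixed-key scheme exercised above; not Hadoop code. */
public class SuffixedKeySketch {

  // Join a base key with optional suffixes, e.g.
  // addSuffixes("dfs.namenode.rpc-address", "ns1", "nn1")
  //   -> "dfs.namenode.rpc-address.ns1.nn1"
  static String addSuffixes(String key, String... suffixes) {
    StringBuilder sb = new StringBuilder(key);
    for (String s : suffixes) {
      if (s != null) {
        sb.append('.').append(s);
      }
    }
    return sb.toString();
  }

  // Most-specific-wins lookup: NN-specific, then nameservice-specific,
  // then the plain (generic) key.
  static String resolve(Map<String, String> conf, String key, String nsId, String nnId) {
    String v = conf.get(addSuffixes(key, nsId, nnId));
    if (v == null) v = conf.get(addSuffixes(key, nsId));
    if (v == null) v = conf.get(key);
    return v;
  }

  public static void main(String[] args) {
    Map<String, String> conf = new HashMap<>();
    String key = "dfs.namenode.shared.edits.dir";
    conf.put(key, "global-default");
    conf.put(key + ".ns1", "ns1-override");
    conf.put(key + ".ns1.nn1", "nn1-override");

    System.out.println(resolve(conf, key, "ns2", "nn1")); // global-default
    System.out.println(resolve(conf, key, "ns1", "nn2")); // ns1-override
    System.out.println(resolve(conf, key, "ns1", "nn1")); // nn1-override
  }
}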
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferProtocol.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferProtocol.java
index 089ab4d..af0bf6a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferProtocol.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDataTransferProtocol.java
@@ -302,7 +302,7 @@
testWrite(firstBlock, BlockConstructionStage.PIPELINE_SETUP_CREATE, 0L,
"Cannot create a RBW block", true);
// test PIPELINE_SETUP_APPEND on an existing block
- newGS = newBlock.getGenerationStamp() + 1;
+ newGS = firstBlock.getGenerationStamp() + 1;
testWrite(firstBlock, BlockConstructionStage.PIPELINE_SETUP_APPEND,
newGS, "Cannot append to a RBW replica", true);
// test PIPELINE_SETUP_APPEND on an existing block
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
index faf7efd..6997ebc 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
@@ -279,7 +279,8 @@
* @throws IOException */
private void startCluster(int numNameNodes, int numDatanodes,
Configuration conf) throws IOException {
- cluster = new MiniDFSCluster.Builder(conf).numNameNodes(numNameNodes)
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(numNameNodes))
.numDataNodes(numDatanodes).build();
cluster.waitActive();
for (int i = 0; i < numNameNodes; i++) {
@@ -507,7 +508,8 @@
InterruptedException {
conf.set(DFSConfigKeys.DFS_HOSTS, hostsFile.toUri().getPath());
int numDatanodes = 1;
- cluster = new MiniDFSCluster.Builder(conf).numNameNodes(numNameNodes)
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(numNameNodes))
.numDataNodes(numDatanodes).setupHostsFile(true).build();
cluster.waitActive();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java
index 033478f..e10eab8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileAppendRestart.java
@@ -90,7 +90,7 @@
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
FileSystem fs = cluster.getFileSystem();
File editLog =
- new File(FSImageTestUtil.getNameNodeCurrentDirs(cluster).get(0),
+ new File(FSImageTestUtil.getNameNodeCurrentDirs(cluster, 0).get(0),
NNStorage.getInProgressEditsFileName(1));
EnumMap<FSEditLogOpCodes, Holder<Integer>> counts;
@@ -98,18 +98,31 @@
writeAndAppend(fs, p1, BLOCK_SIZE, BLOCK_SIZE);
counts = FSImageTestUtil.countEditLogOpTypes(editLog);
+ // OP_ADD to create file
+ // OP_UPDATE_BLOCKS for first block
+ // OP_CLOSE to close file
+ // OP_ADD to reopen file
+ // OP_UPDATE_BLOCKS for second block
+ // OP_CLOSE to close file
assertEquals(2, (int)counts.get(FSEditLogOpCodes.OP_ADD).held);
+ assertEquals(2, (int)counts.get(FSEditLogOpCodes.OP_UPDATE_BLOCKS).held);
assertEquals(2, (int)counts.get(FSEditLogOpCodes.OP_CLOSE).held);
Path p2 = new Path("/not-block-boundaries");
writeAndAppend(fs, p2, BLOCK_SIZE/2, BLOCK_SIZE);
counts = FSImageTestUtil.countEditLogOpTypes(editLog);
- // We get *3* OP_ADDS from this test rather than two. The first
- // OP_ADD comes from re-opening the file to establish the lease,
- // the second comes from the updatePipeline call when the block
- // itself has its generation stamp incremented
- assertEquals(5, (int)counts.get(FSEditLogOpCodes.OP_ADD).held);
- assertEquals(4, (int)counts.get(FSEditLogOpCodes.OP_CLOSE).held);
+ // OP_ADD to create file
+ // OP_UPDATE_BLOCKS for first block
+ // OP_CLOSE to close file
+ // OP_ADD to re-establish the lease
+ // OP_UPDATE_BLOCKS from the updatePipeline call (increments genstamp of last block)
+ // OP_UPDATE_BLOCKS at the start of the second block
+ // OP_CLOSE to close file
+ // Total: 2 OP_ADDs, 3 OP_UPDATE_BLOCKS, and 2 OP_CLOSEs in addition
+ // to the ones above
+ assertEquals(2+2, (int)counts.get(FSEditLogOpCodes.OP_ADD).held);
+ assertEquals(2+3, (int)counts.get(FSEditLogOpCodes.OP_UPDATE_BLOCKS).held);
+ assertEquals(2+2, (int)counts.get(FSEditLogOpCodes.OP_CLOSE).held);
cluster.restartNameNode();
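
Editor's note: the op-count comments in the hunk above encode a small piece of arithmetic — a write-then-append cycle that appends on a block boundary logs 2 OP_ADD, 2 OP_UPDATE_BLOCKS and 2 OP_CLOSE, while appending mid-block adds one extra OP_UPDATE_BLOCKS from the updatePipeline call; the 2+2 / 2+3 / 2+2 assertions are cumulative over both cycles. A self-contained tally of that arithmetic, using a stand-in enum rather than the real FSEditLogOpCodes:

import java.util.EnumMap;
import java.util.Map;

public class AppendOpTally {
  // Stand-in for the relevant FSEditLogOpCodes values; illustrative only.
  enum Op { ADD, UPDATE_BLOCKS, CLOSE }

  // One write-then-append cycle where the append starts on a block boundary:
  // ADD, UPDATE_BLOCKS, CLOSE for the initial write, then ADD (reopen),
  // UPDATE_BLOCKS (new block), CLOSE for the append.
  static Map<Op, Integer> blockAlignedCycle() {
    Map<Op, Integer> m = new EnumMap<>(Op.class);
    m.put(Op.ADD, 2);
    m.put(Op.UPDATE_BLOCKS, 2);
    m.put(Op.CLOSE, 2);
    return m;
  }

  // Appending mid-block logs one extra UPDATE_BLOCKS from the updatePipeline
  // call that bumps the generation stamp of the partial last block.
  static Map<Op, Integer> midBlockCycle() {
    Map<Op, Integer> m = blockAlignedCycle();
    m.merge(Op.UPDATE_BLOCKS, 1, Integer::sum);
    return m;
  }

  public static void main(String[] args) {
    System.out.println("block-aligned cycle: " + blockAlignedCycle()); // {ADD=2, UPDATE_BLOCKS=2, CLOSE=2}
    System.out.println("mid-block cycle:     " + midBlockCycle());     // {ADD=2, UPDATE_BLOCKS=3, CLOSE=2}
  }
}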
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java
index af9d05c..d3df0c0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileCorruption.java
@@ -146,8 +146,14 @@
// report corrupted block by the third datanode
DatanodeRegistration dnR =
DataNodeTestUtils.getDNRegistrationForBP(dataNode, blk.getBlockPoolId());
- cluster.getNamesystem().getBlockManager().findAndMarkBlockAsCorrupt(
- blk, new DatanodeInfo(dnR), "TEST");
+ FSNamesystem ns = cluster.getNamesystem();
+ ns.writeLock();
+ try {
+ cluster.getNamesystem().getBlockManager().findAndMarkBlockAsCorrupt(
+ blk, new DatanodeInfo(dnR), "TEST");
+ } finally {
+ ns.writeUnlock();
+ }
// open the file
fs.open(FILE_PATH);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java
index 8693885..b0878d1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestGetBlocks.java
@@ -25,7 +25,6 @@
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
-import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
@@ -34,8 +33,6 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RemoteException;
-import org.apache.hadoop.security.UserGroupInformation;
-
import junit.framework.TestCase;
/**
* This class tests if block replacement request to data nodes work correctly.
@@ -97,8 +94,8 @@
// get RPC client to namenode
InetSocketAddress addr = new InetSocketAddress("localhost",
cluster.getNameNodePort());
- NamenodeProtocol namenode = new NamenodeProtocolTranslatorPB(addr, CONF,
- UserGroupInformation.getCurrentUser());
+ NamenodeProtocol namenode = NameNodeProxies.createProxy(CONF,
+ NameNode.getUri(addr), NamenodeProtocol.class).getProxy();
// get blocks of size fileLen from dataNodes[0]
BlockWithLocations[] locs;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestIsMethodSupported.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestIsMethodSupported.java
index 0d8174e..3e90665 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestIsMethodSupported.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestIsMethodSupported.java
@@ -22,6 +22,7 @@
import junit.framework.Assert;
+import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocolPB.ClientDatanodeProtocolTranslatorPB;
import org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB;
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
@@ -31,8 +32,13 @@
import org.apache.hadoop.hdfs.protocolPB.NamenodeProtocolTranslatorPB;
import org.apache.hadoop.hdfs.protocolPB.RefreshAuthorizationPolicyProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.protocolPB.RefreshUserMappingsProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.security.RefreshUserMappingsProtocol;
import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
+import org.apache.hadoop.tools.GetUserMappingsProtocol;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -49,7 +55,7 @@
@BeforeClass
public static void setUp() throws Exception {
- cluster = (new MiniDFSCluster.Builder(conf)).numNameNodes(1)
+ cluster = (new MiniDFSCluster.Builder(conf))
.numDataNodes(1).build();
nnAddress = cluster.getNameNode().getNameNodeAddress();
dnAddress = new InetSocketAddress(cluster.getDataNodes().get(0)
@@ -66,8 +72,9 @@
@Test
public void testNamenodeProtocol() throws IOException {
NamenodeProtocolTranslatorPB translator =
- new NamenodeProtocolTranslatorPB(nnAddress, conf,
- UserGroupInformation.getCurrentUser());
+ (NamenodeProtocolTranslatorPB) NameNodeProxies.createNonHAProxy(conf,
+ nnAddress, NamenodeProtocol.class, UserGroupInformation.getCurrentUser(),
+ true).getProxy();
boolean exists = translator.isMethodSupported("rollEditLog");
Assert.assertTrue(exists);
exists = translator.isMethodSupported("bogusMethod");
@@ -99,15 +106,17 @@
@Test
public void testClientNamenodeProtocol() throws IOException {
ClientNamenodeProtocolTranslatorPB translator =
- new ClientNamenodeProtocolTranslatorPB(nnAddress, conf,
- UserGroupInformation.getCurrentUser());
+ (ClientNamenodeProtocolTranslatorPB) NameNodeProxies.createNonHAProxy(
+ conf, nnAddress, ClientProtocol.class,
+ UserGroupInformation.getCurrentUser(), true).getProxy();
Assert.assertTrue(translator.isMethodSupported("mkdirs"));
}
@Test
public void tesJournalProtocol() throws IOException {
- JournalProtocolTranslatorPB translator =
- new JournalProtocolTranslatorPB(nnAddress, conf);
+ JournalProtocolTranslatorPB translator = (JournalProtocolTranslatorPB)
+ NameNodeProxies.createNonHAProxy(conf, nnAddress, JournalProtocol.class,
+ UserGroupInformation.getCurrentUser(), true).getProxy();
// Namenode doesn't implement JournalProtocol
Assert.assertFalse(translator.isMethodSupported("startLogSegment"));
}
@@ -130,24 +139,30 @@
@Test
public void testGetUserMappingsProtocol() throws IOException {
GetUserMappingsProtocolClientSideTranslatorPB translator =
- new GetUserMappingsProtocolClientSideTranslatorPB(
- nnAddress, UserGroupInformation.getCurrentUser(), conf);
+ (GetUserMappingsProtocolClientSideTranslatorPB)
+ NameNodeProxies.createNonHAProxy(conf, nnAddress,
+ GetUserMappingsProtocol.class, UserGroupInformation.getCurrentUser(),
+ true).getProxy();
Assert.assertTrue(translator.isMethodSupported("getGroupsForUser"));
}
@Test
public void testRefreshAuthorizationPolicyProtocol() throws IOException {
- RefreshAuthorizationPolicyProtocolClientSideTranslatorPB translator =
- new RefreshAuthorizationPolicyProtocolClientSideTranslatorPB(
- nnAddress, UserGroupInformation.getCurrentUser(), conf);
+ RefreshAuthorizationPolicyProtocolClientSideTranslatorPB translator =
+ (RefreshAuthorizationPolicyProtocolClientSideTranslatorPB)
+ NameNodeProxies.createNonHAProxy(conf, nnAddress,
+ RefreshAuthorizationPolicyProtocol.class,
+ UserGroupInformation.getCurrentUser(), true).getProxy();
Assert.assertTrue(translator.isMethodSupported("refreshServiceAcl"));
}
@Test
public void testRefreshUserMappingsProtocol() throws IOException {
RefreshUserMappingsProtocolClientSideTranslatorPB translator =
- new RefreshUserMappingsProtocolClientSideTranslatorPB(
- nnAddress, UserGroupInformation.getCurrentUser(), conf);
+ (RefreshUserMappingsProtocolClientSideTranslatorPB)
+ NameNodeProxies.createNonHAProxy(conf, nnAddress,
+ RefreshUserMappingsProtocol.class,
+ UserGroupInformation.getCurrentUser(), true).getProxy();
Assert.assertTrue(
translator.isMethodSupported("refreshUserToGroupsMappings"));
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java
index 4e31523..0eec0d1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestMiniDFSCluster.java
@@ -20,6 +20,7 @@
import junit.framework.Assert;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -37,11 +38,13 @@
private static final String CLUSTER_1 = "cluster1";
private static final String CLUSTER_2 = "cluster2";
private static final String CLUSTER_3 = "cluster3";
+ private static final String CLUSTER_4 = "cluster4";
protected String testDataPath;
protected File testDataDir;
@Before
public void setUp() {
- testDataPath = System.getProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA);
+ testDataPath = System.getProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA,
+ "build/test/data");
testDataDir = new File(new File(testDataPath).getParentFile(),
"miniclusters");
@@ -103,5 +106,21 @@
}
}
-
+ @Test(timeout=100000)
+ public void testIsClusterUpAfterShutdown() throws Throwable {
+ Configuration conf = new HdfsConfiguration();
+ File testDataCluster4 = new File(testDataPath, CLUSTER_4);
+ String c4Path = testDataCluster4.getAbsolutePath();
+ conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, c4Path);
+ MiniDFSCluster cluster4 = new MiniDFSCluster.Builder(conf).build();
+ try {
+ DistributedFileSystem dfs = (DistributedFileSystem) cluster4.getFileSystem();
+ dfs.setSafeMode(FSConstants.SafeModeAction.SAFEMODE_ENTER);
+ cluster4.shutdown();
+ } finally {
+ while(cluster4.isClusterUp()){
+ Thread.sleep(1000);
+ }
+ }
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPersistBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPersistBlocks.java
new file mode 100644
index 0000000..cb98929
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestPersistBlocks.java
@@ -0,0 +1,353 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs;
+
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLog;
+import org.apache.hadoop.hdfs.server.namenode.FSImage;
+import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.INodeFileUnderConstruction;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.log4j.Level;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+import static org.junit.Assert.*;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * A JUnit test for checking if restarting DFS preserves the
+ * blocks that are part of an unclosed file.
+ */
+public class TestPersistBlocks {
+ static {
+ ((Log4JLogger)FSImage.LOG).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)FSNamesystem.LOG).getLogger().setLevel(Level.ALL);
+ }
+
+ private static final int BLOCK_SIZE = 4096;
+ private static final int NUM_BLOCKS = 5;
+
+ private static final String FILE_NAME = "/data";
+ private static final Path FILE_PATH = new Path(FILE_NAME);
+
+ static final byte[] DATA_BEFORE_RESTART = new byte[BLOCK_SIZE * NUM_BLOCKS];
+ static final byte[] DATA_AFTER_RESTART = new byte[BLOCK_SIZE * NUM_BLOCKS];
+
+ private static final String HADOOP_1_0_MULTIBLOCK_TGZ =
+ "hadoop-1.0-multiblock-file.tgz";
+ static {
+ Random rand = new Random();
+ rand.nextBytes(DATA_BEFORE_RESTART);
+ rand.nextBytes(DATA_AFTER_RESTART);
+ }
+
+ /** check if DFS remains in proper condition after a restart */
+ @Test
+ public void testRestartDfs() throws Exception {
+ final Configuration conf = new HdfsConfiguration();
+ // Turn off persistent IPC, so that the DFSClient can survive NN restart
+ conf.setInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY,
+ 0);
+ conf.setBoolean(DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY, true);
+ MiniDFSCluster cluster = null;
+
+ long len = 0;
+ FSDataOutputStream stream;
+ try {
+ cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+ FileSystem fs = cluster.getFileSystem();
+ // Creating a file with 4096 blockSize to write multiple blocks
+ stream = fs.create(FILE_PATH, true, BLOCK_SIZE, (short) 1, BLOCK_SIZE);
+ stream.write(DATA_BEFORE_RESTART);
+ stream.hflush();
+
+ // Wait for at least a few blocks to get through
+ while (len <= BLOCK_SIZE) {
+ FileStatus status = fs.getFileStatus(FILE_PATH);
+ len = status.getLen();
+ Thread.sleep(100);
+ }
+
+ // explicitly do NOT close the file.
+ cluster.restartNameNode();
+
+ // Check that the file has no fewer bytes than before the restart
+ // This would mean that blocks were successfully persisted to the log
+ FileStatus status = fs.getFileStatus(FILE_PATH);
+ assertTrue("Length too short: " + status.getLen(),
+ status.getLen() >= len);
+
+ // And keep writing (ensures that leases are also persisted correctly)
+ stream.write(DATA_AFTER_RESTART);
+ stream.close();
+
+ // Verify that the data showed up, both from before and after the restart.
+ FSDataInputStream readStream = fs.open(FILE_PATH);
+ try {
+ byte[] verifyBuf = new byte[DATA_BEFORE_RESTART.length];
+ IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length);
+ assertArrayEquals(DATA_BEFORE_RESTART, verifyBuf);
+
+ IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length);
+ assertArrayEquals(DATA_AFTER_RESTART, verifyBuf);
+ } finally {
+ IOUtils.closeStream(readStream);
+ }
+ } finally {
+ if (cluster != null) { cluster.shutdown(); }
+ }
+ }
+
+ @Test
+ public void testRestartDfsWithAbandonedBlock() throws Exception {
+ final Configuration conf = new HdfsConfiguration();
+ // Turn off persistent IPC, so that the DFSClient can survive NN restart
+ conf.setInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY,
+ 0);
+ conf.setBoolean(DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY, true);
+ MiniDFSCluster cluster = null;
+
+ long len = 0;
+ FSDataOutputStream stream;
+ try {
+ cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+ FileSystem fs = cluster.getFileSystem();
+ // Creating a file with 4096 blockSize to write multiple blocks
+ stream = fs.create(FILE_PATH, true, BLOCK_SIZE, (short) 1, BLOCK_SIZE);
+ stream.write(DATA_BEFORE_RESTART);
+ stream.hflush();
+
+ // Wait for all of the blocks to get through
+ while (len < BLOCK_SIZE * (NUM_BLOCKS - 1)) {
+ FileStatus status = fs.getFileStatus(FILE_PATH);
+ len = status.getLen();
+ Thread.sleep(100);
+ }
+
+ // Abandon the last block
+ DFSClient dfsclient = DFSClientAdapter.getDFSClient((DistributedFileSystem)fs);
+ LocatedBlocks blocks = dfsclient.getNamenode().getBlockLocations(
+ FILE_NAME, 0, BLOCK_SIZE * NUM_BLOCKS);
+ assertEquals(NUM_BLOCKS, blocks.getLocatedBlocks().size());
+ LocatedBlock b = blocks.getLastLocatedBlock();
+ dfsclient.getNamenode().abandonBlock(b.getBlock(), FILE_NAME,
+ dfsclient.clientName);
+
+ // explicitly do NOT close the file.
+ cluster.restartNameNode();
+
+ // Check that the file has no fewer bytes than before the restart
+ // This would mean that blocks were successfully persisted to the log
+ FileStatus status = fs.getFileStatus(FILE_PATH);
+ assertTrue("Length incorrect: " + status.getLen(),
+ status.getLen() != len - BLOCK_SIZE);
+
+ // Verify the data showed up from before restart, sans abandoned block.
+ FSDataInputStream readStream = fs.open(FILE_PATH);
+ try {
+ byte[] verifyBuf = new byte[DATA_BEFORE_RESTART.length - BLOCK_SIZE];
+ IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length);
+ byte[] expectedBuf = new byte[DATA_BEFORE_RESTART.length - BLOCK_SIZE];
+ System.arraycopy(DATA_BEFORE_RESTART, 0,
+ expectedBuf, 0, expectedBuf.length);
+ assertArrayEquals(expectedBuf, verifyBuf);
+ } finally {
+ IOUtils.closeStream(readStream);
+ }
+ } finally {
+ if (cluster != null) { cluster.shutdown(); }
+ }
+ }
+
+ @Test
+ public void testRestartWithPartialBlockHflushed() throws IOException {
+ final Configuration conf = new HdfsConfiguration();
+ // Turn off persistent IPC, so that the DFSClient can survive NN restart
+ conf.setInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY,
+ 0);
+ conf.setBoolean(DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY, true);
+ MiniDFSCluster cluster = null;
+
+ FSDataOutputStream stream;
+ try {
+ cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+ FileSystem fs = cluster.getFileSystem();
+ NameNode.getAddress(conf).getPort();
+ // Creating a file with 4096 blockSize to write multiple blocks
+ stream = fs.create(FILE_PATH, true, BLOCK_SIZE, (short) 1, BLOCK_SIZE);
+ stream.write(DATA_BEFORE_RESTART);
+ stream.write((byte)1);
+ stream.hflush();
+
+ // explicitly do NOT close the file before restarting the NN.
+ cluster.restartNameNode();
+
+ // this will fail if the final block of the file is prematurely COMPLETEd
+ stream.write((byte)2);
+ stream.hflush();
+ stream.close();
+
+ assertEquals(DATA_BEFORE_RESTART.length + 2,
+ fs.getFileStatus(FILE_PATH).getLen());
+
+ FSDataInputStream readStream = fs.open(FILE_PATH);
+ try {
+ byte[] verifyBuf = new byte[DATA_BEFORE_RESTART.length + 2];
+ IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length);
+ byte[] expectedBuf = new byte[DATA_BEFORE_RESTART.length + 2];
+ System.arraycopy(DATA_BEFORE_RESTART, 0, expectedBuf, 0,
+ DATA_BEFORE_RESTART.length);
+ System.arraycopy(new byte[]{1, 2}, 0, expectedBuf,
+ DATA_BEFORE_RESTART.length, 2);
+ assertArrayEquals(expectedBuf, verifyBuf);
+ } finally {
+ IOUtils.closeStream(readStream);
+ }
+ } finally {
+ if (cluster != null) { cluster.shutdown(); }
+ }
+ }
+
+ @Test
+ public void testRestartWithAppend() throws IOException {
+ final Configuration conf = new HdfsConfiguration();
+ // Turn off persistent IPC, so that the DFSClient can survive NN restart
+ conf.setInt(
+ CommonConfigurationKeysPublic.IPC_CLIENT_CONNECTION_MAXIDLETIME_KEY,
+ 0);
+ conf.setBoolean(DFSConfigKeys.DFS_PERSIST_BLOCKS_KEY, true);
+ MiniDFSCluster cluster = null;
+
+ FSDataOutputStream stream;
+ try {
+ cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+ FileSystem fs = cluster.getFileSystem();
+ NameNode.getAddress(conf).getPort();
+ // Creating a file with 4096 blockSize to write multiple blocks
+ stream = fs.create(FILE_PATH, true, BLOCK_SIZE, (short) 1, BLOCK_SIZE);
+ stream.write(DATA_BEFORE_RESTART, 0, DATA_BEFORE_RESTART.length / 2);
+ stream.close();
+ stream = fs.append(FILE_PATH, BLOCK_SIZE);
+ stream.write(DATA_BEFORE_RESTART, DATA_BEFORE_RESTART.length / 2,
+ DATA_BEFORE_RESTART.length / 2);
+ stream.close();
+
+ assertEquals(DATA_BEFORE_RESTART.length,
+ fs.getFileStatus(FILE_PATH).getLen());
+
+ cluster.restartNameNode();
+
+ assertEquals(DATA_BEFORE_RESTART.length,
+ fs.getFileStatus(FILE_PATH).getLen());
+
+ FSDataInputStream readStream = fs.open(FILE_PATH);
+ try {
+ byte[] verifyBuf = new byte[DATA_BEFORE_RESTART.length];
+ IOUtils.readFully(readStream, verifyBuf, 0, verifyBuf.length);
+ assertArrayEquals(DATA_BEFORE_RESTART, verifyBuf);
+ } finally {
+ IOUtils.closeStream(readStream);
+ }
+ } finally {
+ if (cluster != null) { cluster.shutdown(); }
+ }
+ }
+
+ /**
+ * Earlier versions of HDFS didn't persist block allocation to the edit log.
+ * This makes sure that we can still load an edit log when the OP_CLOSE
+ * is the opcode which adds all of the blocks. This is a regression
+ * test for HDFS-2773.
+ * This test uses a tarred pseudo-distributed cluster from Hadoop 1.0
+ * which has a multi-block file. This is similar to the tests in
+ * {@link TestDFSUpgradeFromImage} but none of those images include
+ * a multi-block file.
+ */
+ @Test
+ public void testEarlierVersionEditLog() throws Exception {
+ final Configuration conf = new HdfsConfiguration();
+
+ String tarFile = System.getProperty("test.cache.data", "build/test/cache")
+ + "/" + HADOOP_1_0_MULTIBLOCK_TGZ;
+ String testDir = System.getProperty("test.build.data", "build/test/data");
+ File dfsDir = new File(testDir, "image-1.0");
+ if (dfsDir.exists() && !FileUtil.fullyDelete(dfsDir)) {
+ throw new IOException("Could not delete dfs directory '" + dfsDir + "'");
+ }
+ FileUtil.unTar(new File(tarFile), new File(testDir));
+
+ File nameDir = new File(dfsDir, "name");
+ GenericTestUtils.assertExists(nameDir);
+ File dataDir = new File(dfsDir, "data");
+ GenericTestUtils.assertExists(dataDir);
+
+ conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY, nameDir.getAbsolutePath());
+ conf.set(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY, dataDir.getAbsolutePath());
+
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
+ .format(false)
+ .manageDataDfsDirs(false)
+ .manageNameDfsDirs(false)
+ .numDataNodes(1)
+ .startupOption(StartupOption.UPGRADE)
+ .build();
+ try {
+ FileSystem fs = cluster.getFileSystem();
+ Path testPath = new Path("/user/todd/4blocks");
+ // Read it without caring about the actual data within - we just need
+ // to make sure that the block states and locations are OK.
+ DFSTestUtil.readFile(fs, testPath);
+
+ // Ensure that we can append to it - if the blocks were in some funny
+ // state we'd get some kind of issue here.
+ FSDataOutputStream stm = fs.append(testPath);
+ try {
+ stm.write(1);
+ } finally {
+ IOUtils.closeStream(stm);
+ }
+ } finally {
+ cluster.shutdown();
+ }
+ }
+}
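
Editor's note: several of the new TestPersistBlocks cases poll fs.getFileStatus(...).getLen() in a bare sleep loop with no upper bound. A bounded variant of that pattern — similar in spirit to Hadoop's GenericTestUtils.waitFor, though this sketch uses only the JDK and its names are illustrative:

import java.util.concurrent.TimeoutException;
import java.util.function.BooleanSupplier;

public class BoundedWait {
  // Poll a condition at a fixed interval, failing with TimeoutException
  // instead of spinning forever if it never becomes true.
  static void waitFor(BooleanSupplier condition, long intervalMs, long timeoutMs)
      throws InterruptedException, TimeoutException {
    long deadline = System.currentTimeMillis() + timeoutMs;
    while (!condition.getAsBoolean()) {
      if (System.currentTimeMillis() > deadline) {
        throw new TimeoutException("condition not met within " + timeoutMs + " ms");
      }
      Thread.sleep(intervalMs);
    }
  }

  public static void main(String[] args) throws Exception {
    long start = System.currentTimeMillis();
    // Stand-in for "wait until the file length reaches at least one block".
    waitFor(() -> System.currentTimeMillis() - start > 200, 50, 5000);
    System.out.println("condition reached");
  }
}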
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java
index a488b0a..e211d20 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestReplication.java
@@ -75,7 +75,8 @@
private void checkFile(FileSystem fileSys, Path name, int repl)
throws IOException {
Configuration conf = fileSys.getConf();
- ClientProtocol namenode = DFSUtil.createNamenode(conf);
+ ClientProtocol namenode = NameNodeProxies.createProxy(conf, fileSys.getUri(),
+ ClientProtocol.class).getProxy();
waitForBlockReplication(name.toString(), namenode,
Math.min(numDatanodes, repl), -1);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationToken.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationToken.java
index 4d18e98..c2aaf06 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationToken.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationToken.java
@@ -20,6 +20,8 @@
+import static org.junit.Assert.*;
+
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
@@ -32,12 +34,16 @@
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
import org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
@@ -64,6 +70,7 @@
config.setBoolean(DFSConfigKeys.DFS_WEBHDFS_ENABLED_KEY, true);
config.setLong(DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY, 10000);
config.setLong(DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY, 5000);
+ config.setBoolean(DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
config.set("hadoop.security.auth_to_local",
"RULE:[2:$1@$0](JobTracker@.*FOO.COM)s/@.*//" + "DEFAULT");
FileSystem.setDefaultUri(config, "hdfs://localhost:" + "0");
@@ -71,7 +78,6 @@
cluster.waitActive();
dtSecretManager = NameNodeAdapter.getDtSecretManager(
cluster.getNamesystem());
- dtSecretManager.startThreads();
}
@After
@@ -269,5 +275,51 @@
}
});
}
-
+
+ /**
+ * Test that the delegation token secret manager only runs when the
+ * NN is out of safe mode. This is because the secret manager
+ * has to log to the edit log, which should not be written in
+ * safe mode. Regression test for HDFS-2579.
+ */
+ @Test
+ public void testDTManagerInSafeMode() throws Exception {
+ cluster.startDataNodes(config, 1, true, StartupOption.REGULAR, null);
+ FileSystem fs = cluster.getFileSystem();
+ for (int i = 0; i < 5; i++) {
+ DFSTestUtil.createFile(fs, new Path("/test-" + i), 100, (short)1, 1L);
+ }
+ cluster.getConfiguration(0).setInt(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY, 500);
+ cluster.getConfiguration(0).setInt(
+ DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000);
+ cluster.setWaitSafeMode(false);
+ cluster.restartNameNode();
+ NameNode nn = cluster.getNameNode();
+ assertTrue(nn.isInSafeMode());
+ DelegationTokenSecretManager sm =
+ NameNodeAdapter.getDtSecretManager(nn.getNamesystem());
+ assertFalse("Secret manager should not run in safe mode", sm.isRunning());
+
+ NameNodeAdapter.leaveSafeMode(nn, false);
+ assertTrue("Secret manager should start when safe mode is exited",
+ sm.isRunning());
+
+ LOG.info("========= entering safemode again");
+
+ NameNodeAdapter.enterSafeMode(nn, false);
+ assertFalse("Secret manager should stop again when safe mode " +
+ "is manually entered", sm.isRunning());
+
+ // Set the cluster to leave safemode quickly on its own.
+ cluster.getConfiguration(0).setInt(
+ DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 0);
+ cluster.setWaitSafeMode(true);
+ cluster.restartNameNode();
+ nn = cluster.getNameNode();
+ sm = NameNodeAdapter.getDtSecretManager(nn.getNamesystem());
+
+ assertFalse(nn.isInSafeMode());
+ assertTrue(sm.isRunning());
+ }
}
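
Editor's note: testDTManagerInSafeMode above pins down a simple state machine — the delegation token secret manager must not run while the NameNode is in safe mode (it would otherwise write to the edit log), and it starts and stops as safe mode is exited and re-entered. A minimal sketch of the transitions the assertions expect, with hypothetical class and field names, purely for illustration:

public class SafeModeGateSketch {
  private boolean inSafeMode = true;            // the NN starts in safe mode after a restart
  private boolean secretManagerRunning = false; // so the secret manager is not yet running

  void leaveSafeMode() { inSafeMode = false; secretManagerRunning = true; }
  void enterSafeMode() { inSafeMode = true;  secretManagerRunning = false; }

  public static void main(String[] args) {
    SafeModeGateSketch nn = new SafeModeGateSketch();
    System.out.println(nn.inSafeMode);           // true: fresh restart
    System.out.println(nn.secretManagerRunning); // false: not running in safe mode
    nn.leaveSafeMode();
    System.out.println(nn.secretManagerRunning); // true: starts once safe mode is exited
    nn.enterSafeMode();
    System.out.println(nn.secretManagerRunning); // false: stops when safe mode is re-entered
  }
}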
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationTokenForProxyUser.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationTokenForProxyUser.java
index cdad31c..6837f65 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationTokenForProxyUser.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/TestDelegationTokenForProxyUser.java
@@ -48,7 +48,6 @@
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
-import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.namenode.web.resources.NamenodeWebHdfsMethods;
import org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
import org.apache.hadoop.hdfs.web.WebHdfsTestUtil;
@@ -114,11 +113,12 @@
DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY, 5000);
config.setStrings(ProxyUsers.getProxySuperuserGroupConfKey(REAL_USER),
"group1");
+ config.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
configureSuperUserIPAddresses(config, REAL_USER);
FileSystem.setDefaultUri(config, "hdfs://localhost:" + "0");
cluster = new MiniDFSCluster.Builder(config).build();
cluster.waitActive();
- NameNodeAdapter.getDtSecretManager(cluster.getNamesystem()).startThreads();
ProxyUsers.refreshSuperUserGroupsConfiguration(config);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/token/block/TestBlockToken.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/token/block/TestBlockToken.java
index 61953c8..01725b1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/token/block/TestBlockToken.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/security/token/block/TestBlockToken.java
@@ -373,7 +373,7 @@
Configuration conf = new HdfsConfiguration();
conf.setBoolean(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, true);
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 512);
- MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(1)
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(1).build();
cluster.waitActive();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java
index eb56746..81b03a5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancer.java
@@ -18,9 +18,10 @@
package org.apache.hadoop.hdfs.server.balancer;
import java.io.IOException;
-import java.net.InetSocketAddress;
+import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeoutException;
@@ -37,28 +38,28 @@
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.NameNodeProxies;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset;
-import org.apache.hadoop.hdfs.server.namenode.NameNode;
/**
* This class tests if a balancer schedules tasks correctly.
*/
public class TestBalancer extends TestCase {
private static final Log LOG = LogFactory.getLog(
- "org.apache.hadoop.hdfs.TestReplication");
+ "org.apache.hadoop.hdfs.TestBalancer");
- final private static long CAPACITY = 500L;
- final private static String RACK0 = "/rack0";
- final private static String RACK1 = "/rack1";
- final private static String RACK2 = "/rack2";
- final static private String fileName = "/tmp.txt";
- final static private Path filePath = new Path(fileName);
+ final static long CAPACITY = 500L;
+ final static String RACK0 = "/rack0";
+ final static String RACK1 = "/rack1";
+ final static String RACK2 = "/rack2";
+ final private static String fileName = "/tmp.txt";
+ final static Path filePath = new Path(fileName);
private MiniDFSCluster cluster;
ClientProtocol client;
@@ -82,9 +83,10 @@
}
/* create a file with a length of <code>fileLen</code> */
- private void createFile(long fileLen, short replicationFactor)
+ static void createFile(MiniDFSCluster cluster, Path filePath, long fileLen,
+ short replicationFactor, int nnIndex)
throws IOException {
- FileSystem fs = cluster.getFileSystem();
+ FileSystem fs = cluster.getFileSystem(nnIndex);
DFSTestUtil.createFile(fs, filePath, fileLen,
replicationFactor, r.nextLong());
DFSTestUtil.waitReplication(fs, filePath, replicationFactor);
@@ -99,11 +101,12 @@
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numNodes).build();
try {
cluster.waitActive();
- client = DFSUtil.createNamenode(conf);
+ client = NameNodeProxies.createProxy(conf, cluster.getFileSystem(0).getUri(),
+ ClientProtocol.class).getProxy();
short replicationFactor = (short)(numNodes-1);
long fileLen = size/replicationFactor;
- createFile(fileLen, replicationFactor);
+ createFile(cluster, filePath, fileLen, replicationFactor, 0);
List<LocatedBlock> locatedBlocks = client.
getBlockLocations(fileName, 0, fileLen).getLocatedBlocks();
@@ -193,7 +196,8 @@
.simulatedCapacities(capacities)
.build();
cluster.waitActive();
- client = DFSUtil.createNamenode(conf);
+ client = NameNodeProxies.createProxy(conf, cluster.getFileSystem(0).getUri(),
+ ClientProtocol.class).getProxy();
for(int i = 0; i < blocksDN.length; i++)
cluster.injectBlocks(i, Arrays.asList(blocksDN[i]));
@@ -211,7 +215,8 @@
* @throws IOException - if getStats() fails
* @throws TimeoutException
*/
- private void waitForHeartBeat(long expectedUsedSpace, long expectedTotalSpace)
+ static void waitForHeartBeat(long expectedUsedSpace,
+ long expectedTotalSpace, ClientProtocol client, MiniDFSCluster cluster)
throws IOException, TimeoutException {
long timeout = TIMEOUT;
long failtime = (timeout <= 0L) ? Long.MAX_VALUE
@@ -248,7 +253,8 @@
* @throws IOException
* @throws TimeoutException
*/
- private void waitForBalancer(long totalUsedSpace, long totalCapacity)
+ static void waitForBalancer(long totalUsedSpace, long totalCapacity,
+ ClientProtocol client, MiniDFSCluster cluster)
throws IOException, TimeoutException {
long timeout = TIMEOUT;
long failtime = (timeout <= 0L) ? Long.MAX_VALUE
@@ -305,13 +311,15 @@
.build();
try {
cluster.waitActive();
- client = DFSUtil.createNamenode(conf);
+ client = NameNodeProxies.createProxy(conf, cluster.getFileSystem(0).getUri(),
+ ClientProtocol.class).getProxy();
long totalCapacity = sum(capacities);
// fill up the cluster to be 30% full
long totalUsedSpace = totalCapacity*3/10;
- createFile(totalUsedSpace/numOfDatanodes, (short)numOfDatanodes);
+ createFile(cluster, filePath, totalUsedSpace / numOfDatanodes,
+ (short) numOfDatanodes, 0);
// start up an empty node with the same capacity and on the same rack
cluster.startDataNodes(conf, 1, true, null,
new String[]{newRack}, new long[]{newCapacity});
@@ -327,17 +335,16 @@
private void runBalancer(Configuration conf,
long totalUsedSpace, long totalCapacity) throws Exception {
- waitForHeartBeat(totalUsedSpace, totalCapacity);
+ waitForHeartBeat(totalUsedSpace, totalCapacity, client, cluster);
// start rebalancing
- final List<InetSocketAddress> namenodes =new ArrayList<InetSocketAddress>();
- namenodes.add(NameNode.getServiceAddress(conf, true));
+ Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
final int r = Balancer.run(namenodes, Balancer.Parameters.DEFALUT, conf);
assertEquals(Balancer.ReturnStatus.SUCCESS.code, r);
- waitForHeartBeat(totalUsedSpace, totalCapacity);
+ waitForHeartBeat(totalUsedSpace, totalCapacity, client, cluster);
LOG.info("Rebalancing with default ctor.");
- waitForBalancer(totalUsedSpace, totalCapacity);
+ waitForBalancer(totalUsedSpace, totalCapacity, client, cluster);
}
/** one-node cluster test*/
@@ -396,13 +403,15 @@
.build();
try {
cluster.waitActive();
- client = DFSUtil.createNamenode(conf);
+ client = NameNodeProxies.createProxy(conf, cluster.getFileSystem(0).getUri(),
+ ClientProtocol.class).getProxy();
long totalCapacity = sum(capacities);
// fill up the cluster to be 30% full
long totalUsedSpace = totalCapacity * 3 / 10;
- createFile(totalUsedSpace / numOfDatanodes, (short) numOfDatanodes);
+ createFile(cluster, filePath, totalUsedSpace / numOfDatanodes,
+ (short) numOfDatanodes, 0);
// start up an empty node with the same capacity and on the same rack
cluster.startDataNodes(conf, 1, true, null, new String[] { newRack },
new long[] { newCapacity });
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithHANameNodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithHANameNodes.java
new file mode 100644
index 0000000..9d13a2b
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithHANameNodes.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.balancer;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.net.URI;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.NameNodeProxies;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology.NNConf;
+import org.apache.hadoop.hdfs.protocol.ClientProtocol;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
+import org.junit.Test;
+
+/**
+ * Test balancer with HA NameNodes
+ */
+public class TestBalancerWithHANameNodes {
+ private MiniDFSCluster cluster;
+ ClientProtocol client;
+
+ static {
+ Balancer.setBlockMoveWaitTime(1000L);
+ }
+
+ /**
+ * Test a cluster with an even block distribution to which a new empty node
+ * is then added. The test starts a cluster with the specified number of
+ * nodes, fills it to 30% capacity with a single file replicated identically
+ * to all datanodes, then adds one new empty node and starts balancing.
+ */
+ @Test(timeout = 60000)
+ public void testBalancerWithHANameNodes() throws Exception {
+ Configuration conf = new HdfsConfiguration();
+ TestBalancer.initConf(conf);
+ long newNodeCapacity = TestBalancer.CAPACITY; // new node's capacity
+ String newNodeRack = TestBalancer.RACK2; // new node's rack
+ // array of racks for original nodes in cluster
+ String[] racks = new String[] { TestBalancer.RACK0, TestBalancer.RACK1 };
+ // array of capacities of original nodes in cluster
+ long[] capacities = new long[] { TestBalancer.CAPACITY,
+ TestBalancer.CAPACITY };
+ assertEquals(capacities.length, racks.length);
+ int numOfDatanodes = capacities.length;
+ NNConf nn1Conf = new MiniDFSNNTopology.NNConf("nn1");
+ nn1Conf.setIpcPort(NameNode.DEFAULT_PORT);
+ Configuration copiedConf = new Configuration(conf);
+ cluster = new MiniDFSCluster.Builder(copiedConf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(capacities.length)
+ .racks(racks)
+ .simulatedCapacities(capacities)
+ .build();
+ HATestUtil.setFailoverConfigurations(cluster, conf);
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(1);
+ Thread.sleep(500);
+ client = NameNodeProxies.createProxy(conf, FileSystem.getDefaultUri(conf),
+ ClientProtocol.class).getProxy();
+ long totalCapacity = TestBalancer.sum(capacities);
+ // fill up the cluster to be 30% full
+ long totalUsedSpace = totalCapacity * 3 / 10;
+ TestBalancer.createFile(cluster, TestBalancer.filePath, totalUsedSpace
+ / numOfDatanodes, (short) numOfDatanodes, 1);
+
+      // start up an empty datanode with the same capacity on a new rack
+ cluster.startDataNodes(conf, 1, true, null, new String[] { newNodeRack },
+ new long[] { newNodeCapacity });
+ totalCapacity += newNodeCapacity;
+ TestBalancer.waitForHeartBeat(totalUsedSpace, totalCapacity, client,
+ cluster);
+ Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);
+ assertEquals(1, namenodes.size());
+ assertTrue(namenodes.contains(HATestUtil.getLogicalUri(cluster)));
+ final int r = Balancer.run(namenodes, Balancer.Parameters.DEFALUT, conf);
+ assertEquals(Balancer.ReturnStatus.SUCCESS.code, r);
+ TestBalancer.waitForBalancer(totalUsedSpace, totalCapacity, client,
+ cluster);
+ } finally {
+ cluster.shutdown();
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithMultipleNameNodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithMultipleNameNodes.java
index 6d06da4..b130e02 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithMultipleNameNodes.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithMultipleNameNodes.java
@@ -18,8 +18,9 @@
package org.apache.hadoop.hdfs.server.balancer;
import java.io.IOException;
-import java.net.InetSocketAddress;
+import java.net.URI;
import java.util.Arrays;
+import java.util.Collection;
import java.util.List;
import java.util.Random;
@@ -34,12 +35,13 @@
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
@@ -155,7 +157,7 @@
LOG.info("BALANCER 1");
// start rebalancing
- final List<InetSocketAddress> namenodes = DFSUtil.getNNServiceRpcAddresses(s.conf);
+ final Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(s.conf);
final int r = Balancer.run(namenodes, Balancer.Parameters.DEFALUT, s.conf);
Assert.assertEquals(Balancer.ReturnStatus.SUCCESS.code, r);
@@ -249,8 +251,9 @@
final ExtendedBlock[][] blocks;
{
LOG.info("UNEVEN 1");
- final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
- .numNameNodes(nNameNodes)
+ final MiniDFSCluster cluster = new MiniDFSCluster
+ .Builder(new Configuration(conf))
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
.numDataNodes(nDataNodes)
.racks(racks)
.simulatedCapacities(capacities)
@@ -258,6 +261,7 @@
LOG.info("UNEVEN 2");
try {
cluster.waitActive();
+ DFSTestUtil.setFederatedConfiguration(cluster, conf);
LOG.info("UNEVEN 3");
final Suite s = new Suite(cluster, nNameNodes, nDataNodes, conf);
blocks = generateBlocks(s, usedSpacePerNN);
@@ -271,7 +275,7 @@
{
LOG.info("UNEVEN 10");
final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
- .numNameNodes(nNameNodes)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(nNameNodes))
.numDataNodes(nDataNodes)
.racks(racks)
.simulatedCapacities(capacities)
@@ -324,13 +328,15 @@
Assert.assertEquals(nDataNodes, racks.length);
LOG.info("RUN_TEST -1");
- final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
- .numNameNodes(nNameNodes)
+ final MiniDFSCluster cluster = new MiniDFSCluster
+ .Builder(new Configuration(conf))
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(nNameNodes))
.numDataNodes(nDataNodes)
.racks(racks)
.simulatedCapacities(capacities)
.build();
LOG.info("RUN_TEST 0");
+ DFSTestUtil.setFederatedConfiguration(cluster, conf);
try {
cluster.waitActive();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
index 8c9b4b3..1ec7511 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManagerTestUtil.java
@@ -24,8 +24,11 @@
import java.util.Set;
import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.util.Daemon;
+import org.junit.Assert;
import com.google.common.base.Preconditions;
@@ -124,6 +127,58 @@
return blockManager.computeDatanodeWork();
}
+ public static int computeInvalidationWork(BlockManager bm) {
+ return bm.computeInvalidateWork(Integer.MAX_VALUE);
+ }
+
+ /**
+ * Compute all the replication and invalidation work for the
+ * given BlockManager.
+ *
+ * This differs from the above functions in that it computes
+ * replication work for all DNs rather than a particular subset,
+ * regardless of invalidation/replication limit configurations.
+ *
+ * NB: you may want to set
+   * {@link DFSConfigKeys#DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY} to
+ * a high value to ensure that all work is calculated.
+ */
+ public static int computeAllPendingWork(BlockManager bm)
+ throws IOException {
+ int work = computeInvalidationWork(bm);
+ work += bm.computeReplicationWork(Integer.MAX_VALUE);
+ return work;
+ }
+
+ /**
+ * Ensure that the given NameNode marks the specified DataNode as
+ * entirely dead/expired.
+ * @param nn the NameNode to manipulate
+ * @param dnName the name of the DataNode
+ */
+ public static void noticeDeadDatanode(NameNode nn, String dnName) {
+ FSNamesystem namesystem = nn.getNamesystem();
+ namesystem.writeLock();
+ try {
+ DatanodeManager dnm = namesystem.getBlockManager().getDatanodeManager();
+ HeartbeatManager hbm = dnm.getHeartbeatManager();
+ DatanodeDescriptor[] dnds = hbm.getDatanodes();
+ DatanodeDescriptor theDND = null;
+ for (DatanodeDescriptor dnd : dnds) {
+ if (dnd.getName().equals(dnName)) {
+ theDND = dnd;
+ }
+ }
+ Assert.assertNotNull("Could not find DN with name: " + dnName, theDND);
+
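+      // Force the DN to appear expired: zero its last-heartbeat time and run
+      // a heartbeat check so the HeartbeatManager marks it dead.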
+ synchronized (hbm) {
+ theDND.setLastUpdate(0);
+ hbm.heartbeatCheck();
+ }
+ } finally {
+ namesystem.writeUnlock();
+ }
+ }
/**
* Change whether the block placement policy will prefer the writer's
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
index c18a5c04..2d7a122 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestHeartbeatHandling.java
@@ -41,7 +41,7 @@
public class TestHeartbeatHandling extends TestCase {
/**
* Test if
- * {@link FSNamesystem#handleHeartbeat(DatanodeRegistration, long, long, long, long, int, int)}
+ * {@link FSNamesystem#handleHeartbeat}
* can pick up replication and/or invalidate requests and observes the max
* limit
*/
@@ -75,7 +75,8 @@
dd.addBlockToBeReplicated(
new Block(i, 0, GenerationStamp.FIRST_VALID_STAMP), ONE_TARGET);
}
- DatanodeCommand[]cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem);
+ DatanodeCommand[] cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd,
+ namesystem).getCommands();
assertEquals(1, cmds.length);
assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction());
assertEquals(MAX_REPLICATE_LIMIT, ((BlockCommand)cmds[0]).getBlocks().length);
@@ -85,27 +86,31 @@
blockList.add(new Block(i, 0, GenerationStamp.FIRST_VALID_STAMP));
}
dd.addBlocksToBeInvalidated(blockList);
- cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem);
+ cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem)
+ .getCommands();
assertEquals(2, cmds.length);
assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction());
assertEquals(MAX_REPLICATE_LIMIT, ((BlockCommand)cmds[0]).getBlocks().length);
assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[1].getAction());
assertEquals(MAX_INVALIDATE_LIMIT, ((BlockCommand)cmds[1]).getBlocks().length);
- cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem);
+ cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem)
+ .getCommands();
assertEquals(2, cmds.length);
assertEquals(DatanodeProtocol.DNA_TRANSFER, cmds[0].getAction());
assertEquals(REMAINING_BLOCKS, ((BlockCommand)cmds[0]).getBlocks().length);
assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[1].getAction());
assertEquals(MAX_INVALIDATE_LIMIT, ((BlockCommand)cmds[1]).getBlocks().length);
- cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem);
+ cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem)
+ .getCommands();
assertEquals(1, cmds.length);
assertEquals(DatanodeProtocol.DNA_INVALIDATE, cmds[0].getAction());
assertEquals(REMAINING_BLOCKS, ((BlockCommand)cmds[0]).getBlocks().length);
- cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem);
- assertEquals(null, cmds);
+ cmds = NameNodeAdapter.sendHeartBeat(nodeReg, dd, namesystem)
+ .getCommands();
+ assertEquals(0, cmds.length);
}
} finally {
namesystem.writeUnlock();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java
index 986ca13..d47f110 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestNodeCount.java
@@ -81,15 +81,8 @@
DataNodeProperties dnprop = cluster.stopDataNode(datanode.getName());
// make sure that NN detects that the datanode is down
- try {
- namesystem.writeLock();
- synchronized (hm) {
- datanode.setLastUpdate(0); // mark it dead
- hm.heartbeatCheck();
- }
- } finally {
- namesystem.writeUnlock();
- }
+ BlockManagerTestUtil.noticeDeadDatanode(
+ cluster.getNameNode(), datanode.getName());
// the block will be replicated
DFSTestUtil.waitReplication(fs, FILE_PATH, REPLICATION_FACTOR);
@@ -121,16 +114,8 @@
// bring down non excessive datanode
dnprop = cluster.stopDataNode(nonExcessDN.getName());
// make sure that NN detects that the datanode is down
-
- try {
- namesystem.writeLock();
- synchronized(hm) {
- nonExcessDN.setLastUpdate(0); // mark it dead
- hm.heartbeatCheck();
- }
- } finally {
- namesystem.writeUnlock();
- }
+ BlockManagerTestUtil.noticeDeadDatanode(
+ cluster.getNameNode(), nonExcessDN.getName());
// The block should be replicated
initializeTimeout(TIMEOUT);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingDataNodeMessages.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingDataNodeMessages.java
new file mode 100644
index 0000000..16977bb
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingDataNodeMessages.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.blockmanagement;
+
+import static org.junit.Assert.*;
+
+import java.util.Queue;
+
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
+import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
+import org.junit.Test;
+
+import com.google.common.base.Joiner;
+
+
+public class TestPendingDataNodeMessages {
+ PendingDataNodeMessages msgs = new PendingDataNodeMessages();
+
+ private final Block block1Gs1 = new Block(1, 0, 1);
+ private final Block block1Gs2 = new Block(1, 0, 2);
+ private final Block block1Gs2DifferentInstance =
+ new Block(1, 0, 2);
+ private final Block block2Gs1 = new Block(2, 0, 1);
+
+ private final DatanodeDescriptor fakeDN = new DatanodeDescriptor(
+ new DatanodeID("fake"));
+
+ @Test
+ public void testQueues() {
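+    // Queue two reports for the same block ID with different generation stamps.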
+ msgs.enqueueReportedBlock(fakeDN, block1Gs1, ReplicaState.FINALIZED);
+ msgs.enqueueReportedBlock(fakeDN, block1Gs2, ReplicaState.FINALIZED);
+
+ assertEquals(2, msgs.count());
+
+ // Nothing queued yet for block 2
+ assertNull(msgs.takeBlockQueue(block2Gs1));
+ assertEquals(2, msgs.count());
+
+ Queue<ReportedBlockInfo> q =
+ msgs.takeBlockQueue(block1Gs2DifferentInstance);
+ assertEquals(
+ "ReportedBlockInfo [block=blk_1_1, dn=fake, reportedState=FINALIZED]," +
+ "ReportedBlockInfo [block=blk_1_2, dn=fake, reportedState=FINALIZED]",
+ Joiner.on(",").join(q));
+ assertEquals(0, msgs.count());
+
+ // Should be null if we pull again
+ assertNull(msgs.takeBlockQueue(block1Gs1));
+ assertEquals(0, msgs.count());
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeAdapter.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeAdapter.java
index fb015a2..6ab878c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeAdapter.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeAdapter.java
@@ -50,6 +50,29 @@
boolean heartbeatsDisabledForTests) {
dn.setHeartbeatsDisabledForTests(heartbeatsDisabledForTests);
}
+
+ public static void triggerDeletionReport(DataNode dn) throws IOException {
+ for (BPOfferService bpos : dn.getAllBpOs()) {
+ bpos.triggerDeletionReportForTests();
+ }
+ }
+
+ public static void triggerHeartbeat(DataNode dn) throws IOException {
+ for (BPOfferService bpos : dn.getAllBpOs()) {
+ bpos.triggerHeartbeatForTests();
+ }
+ }
+
+ public static void triggerBlockReport(DataNode dn) throws IOException {
+ for (BPOfferService bpos : dn.getAllBpOs()) {
+ bpos.triggerBlockReportForTests();
+ }
+ }
+
+ public static long getPendingAsyncDeletions(DataNode dn) {
+ FSDataset fsd = (FSDataset)dn.getFSDataset();
+ return fsd.asyncDiskService.countPendingDeletions();
+ }
/**
* Insert a Mockito spy object between the given DataNode and
@@ -69,10 +92,20 @@
}
Preconditions.checkArgument(bpos != null,
"No such bpid: %s", bpid);
+
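+    // Find the BPServiceActor in this BPOfferService that talks to the given
+    // NN, identified by its service RPC address.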
+ BPServiceActor bpsa = null;
+ for (BPServiceActor thisBpsa : bpos.getBPServiceActors()) {
+ if (thisBpsa.getNNSocketAddress().equals(nn.getServiceRpcAddress())) {
+ bpsa = thisBpsa;
+ break;
+ }
+ }
+ Preconditions.checkArgument(bpsa != null,
+ "No service actor to NN at %s", nn.getServiceRpcAddress());
- DatanodeProtocolClientSideTranslatorPB origNN = bpos.getBpNamenode();
+ DatanodeProtocolClientSideTranslatorPB origNN = bpsa.getNameNodeProxy();
DatanodeProtocolClientSideTranslatorPB spy = Mockito.spy(origNN);
- bpos.setBpNamenode(spy);
+ bpsa.setNameNode(spy);
return spy;
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java
new file mode 100644
index 0000000..41e7c8b
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java
@@ -0,0 +1,373 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.datanode;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetrics;
+import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
+import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
+import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
+import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat.State;
+import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
+import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
+import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
+import org.apache.hadoop.hdfs.server.protocol.StorageReport;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.log4j.Level;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.ArgumentCaptor;
+import org.mockito.Mockito;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
+
+import com.google.common.base.Supplier;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+public class TestBPOfferService {
+
+ private static final String FAKE_BPID = "fake bpid";
+ private static final String FAKE_CLUSTERID = "fake cluster";
+ protected static final Log LOG = LogFactory.getLog(
+ TestBPOfferService.class);
+ private static final ExtendedBlock FAKE_BLOCK =
+ new ExtendedBlock(FAKE_BPID, 12345L);
+
+ static {
+ ((Log4JLogger)DataNode.LOG).getLogger().setLevel(Level.ALL);
+ }
+
+ private DatanodeProtocolClientSideTranslatorPB mockNN1;
+ private DatanodeProtocolClientSideTranslatorPB mockNN2;
+ private NNHAStatusHeartbeat[] mockHaStatuses = new NNHAStatusHeartbeat[2];
+  private int[] heartbeatCounts = new int[2];
+ private DataNode mockDn;
+ private FSDatasetInterface mockFSDataset;
+
+ @Before
+ public void setupMocks() throws Exception {
+ mockNN1 = setupNNMock(0);
+ mockNN2 = setupNNMock(1);
+
+ // Set up a mock DN with the bare-bones configuration
+ // objects, etc.
+ mockDn = Mockito.mock(DataNode.class);
+ Mockito.doReturn(true).when(mockDn).shouldRun();
+ Configuration conf = new Configuration();
+ Mockito.doReturn(conf).when(mockDn).getConf();
+ Mockito.doReturn(new DNConf(conf)).when(mockDn).getDnConf();
+ Mockito.doReturn(DataNodeMetrics.create(conf, "fake dn"))
+ .when(mockDn).getMetrics();
+
+ // Set up a simulated dataset with our fake BP
+ mockFSDataset = Mockito.spy(new SimulatedFSDataset(null, null, conf));
+ mockFSDataset.addBlockPool(FAKE_BPID, conf);
+
+ // Wire the dataset to the DN.
+ Mockito.doReturn(mockFSDataset).when(mockDn).getFSDataset();
+ }
+
+ /**
+   * Set up a mock NN with the bare minimum for a DN to register with it.
+ */
+ private DatanodeProtocolClientSideTranslatorPB setupNNMock(int nnIdx)
+ throws Exception {
+ DatanodeProtocolClientSideTranslatorPB mock =
+ Mockito.mock(DatanodeProtocolClientSideTranslatorPB.class);
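+    // Stub the minimum NN-side RPCs a DN needs to register: versionRequest
+    // for the handshake, registerDatanode, and sendHeartbeat (answered by
+    // HeartbeatAnswer below).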
+ Mockito.doReturn(
+ new NamespaceInfo(1, FAKE_CLUSTERID, FAKE_BPID,
+ 0, HdfsConstants.LAYOUT_VERSION))
+ .when(mock).versionRequest();
+
+ Mockito.doReturn(new DatanodeRegistration("fake-node"))
+ .when(mock).registerDatanode(Mockito.any(DatanodeRegistration.class),
+ Mockito.any(DatanodeStorage[].class));
+
+ Mockito.doAnswer(new HeartbeatAnswer(nnIdx))
+ .when(mock).sendHeartbeat(
+ Mockito.any(DatanodeRegistration.class),
+ Mockito.any(StorageReport[].class),
+ Mockito.anyInt(),
+ Mockito.anyInt(),
+ Mockito.anyInt());
+ mockHaStatuses[nnIdx] = new NNHAStatusHeartbeat(State.STANDBY, 0);
+ return mock;
+ }
+
+ /**
+ * Mock answer for heartbeats which returns an empty set of commands
+ * and the HA status for the chosen NN from the
+ * {@link TestBPOfferService#mockHaStatuses} array.
+ */
+ private class HeartbeatAnswer implements Answer<HeartbeatResponse> {
+ private final int nnIdx;
+
+ public HeartbeatAnswer(int nnIdx) {
+ this.nnIdx = nnIdx;
+ }
+
+ @Override
+ public HeartbeatResponse answer(InvocationOnMock invocation) throws Throwable {
+ heartbeatCounts[nnIdx]++;
+ return new HeartbeatResponse(new DatanodeCommand[0],
+ mockHaStatuses[nnIdx]);
+ }
+ }
+
+
+ /**
+ * Test that the BPOS can register to talk to two different NNs,
+ * sends block reports to both, etc.
+ */
+ @Test
+ public void testBasicFunctionality() throws Exception {
+ BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2);
+ bpos.start();
+ try {
+ waitForInitialization(bpos);
+
+      // The DN should have registered with both NNs.
+ Mockito.verify(mockNN1).registerDatanode(Mockito.any(DatanodeRegistration.class),
+ Mockito.any(DatanodeStorage[].class));
+ Mockito.verify(mockNN2).registerDatanode(Mockito.any(DatanodeRegistration.class),
+ Mockito.any(DatanodeStorage[].class));
+
+ // Should get block reports from both NNs
+ waitForBlockReport(mockNN1);
+ waitForBlockReport(mockNN2);
+
+ // When we receive a block, it should report it to both NNs
+ bpos.notifyNamenodeReceivedBlock(FAKE_BLOCK, "");
+
+ ReceivedDeletedBlockInfo[] ret = waitForBlockReceived(FAKE_BLOCK, mockNN1);
+ assertEquals(1, ret.length);
+ assertEquals(FAKE_BLOCK.getLocalBlock(), ret[0].getBlock());
+
+ ret = waitForBlockReceived(FAKE_BLOCK, mockNN2);
+ assertEquals(1, ret.length);
+ assertEquals(FAKE_BLOCK.getLocalBlock(), ret[0].getBlock());
+
+ } finally {
+ bpos.stop();
+ }
+ }
+
+ /**
+ * Test that DNA_INVALIDATE commands from the standby are ignored.
+ */
+ @Test
+ public void testIgnoreDeletionsFromNonActive() throws Exception {
+ BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2);
+
+ // Ask to invalidate FAKE_BLOCK when block report hits the
+ // standby
+ Mockito.doReturn(new BlockCommand(DatanodeProtocol.DNA_INVALIDATE,
+ FAKE_BPID, new Block[] { FAKE_BLOCK.getLocalBlock() }))
+ .when(mockNN2).blockReport(
+ Mockito.<DatanodeRegistration>anyObject(),
+ Mockito.eq(FAKE_BPID),
+ Mockito.<StorageBlockReport[]>anyObject());
+
+ bpos.start();
+ try {
+ waitForInitialization(bpos);
+
+ // Should get block reports from both NNs
+ waitForBlockReport(mockNN1);
+ waitForBlockReport(mockNN2);
+
+ } finally {
+ bpos.stop();
+ }
+
+ // Should ignore the delete command from the standby
+ Mockito.verify(mockFSDataset, Mockito.never())
+ .invalidate(Mockito.eq(FAKE_BPID),
+ (Block[]) Mockito.anyObject());
+ }
+
+ /**
+   * Ensure that, if the two NNs configured for a block pool report
+   * conflicting namespace information (here, different cluster IDs),
+   * registration succeeds with only one of them.
+ */
+ @Test
+ public void testNNsFromDifferentClusters() throws Exception {
+ Mockito.doReturn(
+ new NamespaceInfo(1, "fake foreign cluster", FAKE_BPID,
+ 0, HdfsConstants.LAYOUT_VERSION))
+ .when(mockNN1).versionRequest();
+
+ BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2);
+ bpos.start();
+ try {
+ waitForOneToFail(bpos);
+ } finally {
+ bpos.stop();
+ }
+ }
+
+ /**
+ * Test that the DataNode determines the active NameNode correctly
+ * based on the HA-related information in heartbeat responses.
+ * See HDFS-2627.
+ */
+ @Test
+ public void testPickActiveNameNode() throws Exception {
+ BPOfferService bpos = setupBPOSForNNs(mockNN1, mockNN2);
+ bpos.start();
+ try {
+ waitForInitialization(bpos);
+
+ // Should start with neither NN as active.
+ assertNull(bpos.getActiveNN());
+
+ // Have NN1 claim active at txid 1
+ mockHaStatuses[0] = new NNHAStatusHeartbeat(State.ACTIVE, 1);
+ bpos.triggerHeartbeatForTests();
+ assertSame(mockNN1, bpos.getActiveNN());
+
+ // NN2 claims active at a higher txid
+ mockHaStatuses[1] = new NNHAStatusHeartbeat(State.ACTIVE, 2);
+ bpos.triggerHeartbeatForTests();
+ assertSame(mockNN2, bpos.getActiveNN());
+
+ // Even after another heartbeat from the first NN, it should
+ // think NN2 is active, since it claimed a higher txid
+ bpos.triggerHeartbeatForTests();
+ assertSame(mockNN2, bpos.getActiveNN());
+
+ // Even if NN2 goes to standby, DN shouldn't reset to talking to NN1,
+ // because NN1's txid is lower than the last active txid. Instead,
+ // it should consider neither active.
+ mockHaStatuses[1] = new NNHAStatusHeartbeat(State.STANDBY, 2);
+ bpos.triggerHeartbeatForTests();
+ assertNull(bpos.getActiveNN());
+
+ // Now if NN1 goes back to a higher txid, it should be considered active
+ mockHaStatuses[0] = new NNHAStatusHeartbeat(State.ACTIVE, 3);
+ bpos.triggerHeartbeatForTests();
+ assertSame(mockNN1, bpos.getActiveNN());
+
+ } finally {
+ bpos.stop();
+ }
+ }
+
+ private void waitForOneToFail(final BPOfferService bpos)
+ throws Exception {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ return bpos.countNameNodes() == 1;
+ }
+ }, 100, 10000);
+ }
+
+ /**
+   * Create a BPOfferService which registers with, and sends heartbeats to,
+   * the specified namenode proxy objects.
+ * @throws IOException
+ */
+ private BPOfferService setupBPOSForNNs(
+ DatanodeProtocolClientSideTranslatorPB ... nns) throws IOException {
+ // Set up some fake InetAddresses, then override the connectToNN
+ // function to return the corresponding proxies.
+
+ final Map<InetSocketAddress, DatanodeProtocolClientSideTranslatorPB> nnMap = Maps.newLinkedHashMap();
+ for (int port = 0; port < nns.length; port++) {
+ nnMap.put(new InetSocketAddress(port), nns[port]);
+ Mockito.doReturn(nns[port]).when(mockDn).connectToNN(
+ Mockito.eq(new InetSocketAddress(port)));
+ }
+
+ return new BPOfferService(Lists.newArrayList(nnMap.keySet()), mockDn);
+ }
+
+ private void waitForInitialization(final BPOfferService bpos)
+ throws Exception {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ return bpos.isAlive() && bpos.isInitialized();
+ }
+ }, 100, 10000);
+ }
+
+ private void waitForBlockReport(final DatanodeProtocolClientSideTranslatorPB mockNN)
+ throws Exception {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ try {
+ Mockito.verify(mockNN).blockReport(
+ Mockito.<DatanodeRegistration>anyObject(),
+ Mockito.eq(FAKE_BPID),
+ Mockito.<StorageBlockReport[]>anyObject());
+ return true;
+ } catch (Throwable t) {
+ LOG.info("waiting on block report: " + t.getMessage());
+ return false;
+ }
+ }
+ }, 500, 10000);
+ }
+
+ private ReceivedDeletedBlockInfo[] waitForBlockReceived(
+ ExtendedBlock fakeBlock,
+ DatanodeProtocolClientSideTranslatorPB mockNN) throws Exception {
+ final ArgumentCaptor<StorageReceivedDeletedBlocks[]> captor =
+ ArgumentCaptor.forClass(StorageReceivedDeletedBlocks[].class);
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+
+ @Override
+ public Boolean get() {
+ try {
+ Mockito.verify(mockNN1).blockReceivedAndDeleted(
+ Mockito.<DatanodeRegistration>anyObject(),
+ Mockito.eq(FAKE_BPID),
+ captor.capture());
+ return true;
+ } catch (Throwable t) {
+ return false;
+ }
+ }
+ }, 100, 10000);
+ return captor.getValue()[0].getBlocks();
+ }
+
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolManager.java
new file mode 100644
index 0000000..c0301ac
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolManager.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.datanode;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
+
+
+public class TestBlockPoolManager {
+ private Log LOG = LogFactory.getLog(TestBlockPoolManager.class);
+ private DataNode mockDN = Mockito.mock(DataNode.class);
+ private BlockPoolManager bpm;
+ private StringBuilder log = new StringBuilder();
+ private int mockIdx = 1;
+
+ @Before
+ public void setupBPM() {
+ bpm = new BlockPoolManager(mockDN){
+
+ @Override
+ protected BPOfferService createBPOS(List<InetSocketAddress> nnAddrs) {
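+        // Return a mock BPOS and record create/refresh/stop events in the
+        // shared log so tests can assert on the exact lifecycle sequence.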
+ final int idx = mockIdx++;
+ doLog("create #" + idx);
+ final BPOfferService bpos = Mockito.mock(BPOfferService.class);
+ Mockito.doReturn("Mock BPOS #" + idx).when(bpos).toString();
+ // Log refreshes
+ try {
+ Mockito.doAnswer(
+ new Answer<Void>() {
+ @Override
+ public Void answer(InvocationOnMock invocation) throws Throwable {
+ doLog("refresh #" + idx);
+ return null;
+ }
+ }).when(bpos).refreshNNList(
+ Mockito.<ArrayList<InetSocketAddress>>any());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ // Log stops
+ Mockito.doAnswer(
+ new Answer<Void>() {
+ @Override
+ public Void answer(InvocationOnMock invocation) throws Throwable {
+ doLog("stop #" + idx);
+ bpm.remove(bpos);
+ return null;
+ }
+ }).when(bpos).stop();
+ return bpos;
+ }
+ };
+ }
+
+ private void doLog(String string) {
+ synchronized(log) {
+ LOG.info(string);
+ log.append(string).append("\n");
+ }
+ }
+
+ @Test
+ public void testSimpleSingleNS() throws Exception {
+ Configuration conf = new Configuration();
+ conf.set(DFSConfigKeys.FS_DEFAULT_NAME_KEY,
+ "hdfs://mock1:8020");
+ bpm.refreshNamenodes(conf);
+ assertEquals("create #1\n", log.toString());
+ }
+
+ @Test
+ public void testFederationRefresh() throws Exception {
+ Configuration conf = new Configuration();
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES,
+ "ns1,ns2");
+ addNN(conf, "ns1", "mock1:8020");
+ addNN(conf, "ns2", "mock1:8020");
+ bpm.refreshNamenodes(conf);
+ assertEquals(
+ "create #1\n" +
+ "create #2\n", log.toString());
+ log.setLength(0);
+
+ // Remove the first NS
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES,
+ "ns1");
+ bpm.refreshNamenodes(conf);
+ assertEquals(
+ "stop #1\n" +
+ "refresh #2\n", log.toString());
+ log.setLength(0);
+
+ // Add back an NS -- this creates a new BPOS since the old
+ // one for ns2 should have been previously retired
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES,
+ "ns1,ns2");
+ bpm.refreshNamenodes(conf);
+ assertEquals(
+ "create #3\n" +
+ "refresh #2\n", log.toString());
+ }
+
+ private static void addNN(Configuration conf, String ns, String addr) {
+ String key = DFSUtil.addKeySuffixes(
+ DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, ns);
+ conf.set(key, addr);
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java
index cb42441..59a61cf 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java
@@ -42,10 +42,13 @@
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
+import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
+import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat.State;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.Daemon;
@@ -137,7 +140,9 @@
Mockito.anyInt(),
Mockito.anyInt(),
Mockito.anyInt()))
- .thenReturn(new DatanodeCommand[0]);
+ .thenReturn(new HeartbeatResponse(
+ new DatanodeCommand[0],
+ new NNHAStatusHeartbeat(State.ACTIVE, 1)));
dn = new DataNode(conf, dirs, null) {
@Override
@@ -147,14 +152,8 @@
return namenode;
}
};
- dn.runDatanodeDaemon();
- while (!dn.isDatanodeFullyStarted()) {
- try {
- Thread.sleep(50);
- } catch (InterruptedException e) {
- fail("Interrupted starting DN");
- }
- }
+      // Trigger a heartbeat so that the DN acknowledges the NN as active.
+ dn.getAllBpOs()[0].triggerHeartbeatForTests();
}
/**
@@ -462,7 +461,7 @@
initReplicaRecovery(any(RecoveringBlock.class));
Daemon d = spyDN.recoverBlocks(initRecoveringBlocks());
d.join();
- DatanodeProtocol dnP = dn.getBPNamenode(POOL_ID);
+ DatanodeProtocol dnP = dn.getActiveNamenodeForBP(POOL_ID);
verify(dnP).commitBlockSynchronization(
block, RECOVERY_ID, 0, true, true, DatanodeID.EMPTY_ARRAY);
}
@@ -519,7 +518,7 @@
} catch (IOException e) {
e.getMessage().startsWith("Cannot recover ");
}
- DatanodeProtocol namenode = dn.getBPNamenode(POOL_ID);
+ DatanodeProtocol namenode = dn.getActiveNamenodeForBP(POOL_ID);
verify(namenode, never()).commitBlockSynchronization(
any(ExtendedBlock.class), anyLong(), anyLong(), anyBoolean(),
anyBoolean(), any(DatanodeID[].class));
@@ -548,7 +547,7 @@
} catch (IOException e) {
e.getMessage().startsWith("Cannot recover ");
}
- DatanodeProtocol namenode = dn.getBPNamenode(POOL_ID);
+ DatanodeProtocol namenode = dn.getActiveNamenodeForBP(POOL_ID);
verify(namenode, never()).commitBlockSynchronization(
any(ExtendedBlock.class), anyLong(), anyLong(), anyBoolean(),
anyBoolean(), any(DatanodeID[].class));
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeExit.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeExit.java
index b7a1017..0faa5b1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeExit.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeExit.java
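+      // Make the second NN (index 1) active and give the transition a moment
+      // to settle before creating the client proxy.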
@@ -28,6 +28,7 @@
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -36,7 +37,6 @@
* Tests if DataNode process exits if all Block Pool services exit.
*/
public class TestDataNodeExit {
- private static int BASEPORT = 9923;
private static long WAIT_TIME_IN_MILLIS = 10;
Configuration conf;
MiniDFSCluster cluster = null;
@@ -46,8 +46,9 @@
conf = new HdfsConfiguration();
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100);
conf.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 100);
- cluster = new MiniDFSCluster.Builder(conf).numNameNodes(3)
- .nameNodePort(BASEPORT).build();
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(3))
+ .build();
for (int i = 0; i < 3; i++) {
cluster.waitActive(i);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMultipleRegistrations.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMultipleRegistrations.java
index 4a84ce8..20a16c3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMultipleRegistrations.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMultipleRegistrations.java
@@ -23,6 +23,8 @@
import static org.junit.Assert.assertNotSame;
import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
@@ -30,6 +32,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
@@ -55,8 +58,9 @@
*/
@Test
public void test2NNRegistration() throws IOException {
- MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(2)
- .nameNodePort(9928).build();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
+ .build();
try {
cluster.waitActive();
NameNode nn1 = cluster.getNameNode(0);
@@ -90,23 +94,22 @@
assertEquals("number of volumes is wrong", 2, volInfos.size());
for (BPOfferService bpos : dn.getAllBpOs()) {
- LOG.info("reg: bpid=" + "; name=" + bpos.bpRegistration.name + "; sid="
- + bpos.bpRegistration.storageID + "; nna=" + bpos.nnAddr);
+ LOG.info("BP: " + bpos);
}
BPOfferService bpos1 = dn.getAllBpOs()[0];
BPOfferService bpos2 = dn.getAllBpOs()[1];
// The order of bpos is not guaranteed, so fix the order
- if (bpos1.nnAddr.equals(nn2.getNameNodeAddress())) {
+ if (getNNSocketAddress(bpos1).equals(nn2.getNameNodeAddress())) {
BPOfferService tmp = bpos1;
bpos1 = bpos2;
bpos2 = tmp;
}
- assertEquals("wrong nn address", bpos1.nnAddr,
+ assertEquals("wrong nn address", getNNSocketAddress(bpos1),
nn1.getNameNodeAddress());
- assertEquals("wrong nn address", bpos2.nnAddr,
+ assertEquals("wrong nn address", getNNSocketAddress(bpos2),
nn2.getNameNodeAddress());
assertEquals("wrong bpid", bpos1.getBlockPoolId(), bpid1);
assertEquals("wrong bpid", bpos2.getBlockPoolId(), bpid2);
@@ -120,6 +123,12 @@
cluster.shutdown();
}
}
+
+ private static InetSocketAddress getNNSocketAddress(BPOfferService bpos) {
+ List<BPServiceActor> actors = bpos.getBPServiceActors();
+ assertEquals(1, actors.size());
+ return actors.get(0).getNNSocketAddress();
+ }
/**
* starts single nn and single dn and verifies registration and handshake
@@ -153,15 +162,16 @@
for (BPOfferService bpos : dn.getAllBpOs()) {
LOG.info("reg: bpid=" + "; name=" + bpos.bpRegistration.name + "; sid="
- + bpos.bpRegistration.storageID + "; nna=" + bpos.nnAddr);
+ + bpos.bpRegistration.storageID + "; nna=" +
+ getNNSocketAddress(bpos));
}
// try block report
BPOfferService bpos1 = dn.getAllBpOs()[0];
- bpos1.lastBlockReport = 0;
- bpos1.blockReport();
+ bpos1.triggerBlockReportForTests();
- assertEquals("wrong nn address", bpos1.nnAddr,
+ assertEquals("wrong nn address",
+ getNNSocketAddress(bpos1),
nn1.getNameNodeAddress());
assertEquals("wrong bpid", bpos1.getBlockPoolId(), bpid1);
assertEquals("wrong cid", dn.getClusterId(), cid1);
@@ -179,8 +189,9 @@
@Test
public void testClusterIdMismatch() throws IOException {
- MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(2).
- nameNodePort(9928).build();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
+ .build();
try {
cluster.waitActive();
@@ -215,25 +226,27 @@
Configuration conf = new HdfsConfiguration();
// start Federated cluster and add a node.
- MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(2).
- nameNodePort(9928).build();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
+ .build();
Assert.assertNotNull(cluster);
Assert.assertEquals("(1)Should be 2 namenodes", 2, cluster.getNumNameNodes());
// add a node
- cluster.addNameNode(conf, 9929);
+ cluster.addNameNode(conf, 0);
Assert.assertEquals("(1)Should be 3 namenodes", 3, cluster.getNumNameNodes());
cluster.shutdown();
// 2. start with Federation flag set
conf = new HdfsConfiguration();
- cluster = new MiniDFSCluster.Builder(conf).federation(true).
- nameNodePort(9928).build();
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(1))
+ .build();
Assert.assertNotNull(cluster);
Assert.assertEquals("(2)Should be 1 namenodes", 1, cluster.getNumNameNodes());
// add a node
- cluster.addNameNode(conf, 9929);
+ cluster.addNameNode(conf, 0);
Assert.assertEquals("(2)Should be 2 namenodes", 2, cluster.getNumNameNodes());
cluster.shutdown();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDatanodeRegister.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDatanodeRegister.java
index ca9b3dc..dbbaedd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDatanodeRegister.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDatanodeRegister.java
@@ -42,7 +42,10 @@
DataNode mockDN = mock(DataNode.class);
Mockito.doReturn(true).when(mockDN).shouldRun();
- BPOfferService bpos = new BPOfferService(INVALID_ADDR, mockDN);
+ BPOfferService mockBPOS = Mockito.mock(BPOfferService.class);
+ Mockito.doReturn(mockDN).when(mockBPOS).getDataNode();
+
+ BPServiceActor actor = new BPServiceActor(INVALID_ADDR, mockBPOS);
NamespaceInfo fakeNSInfo = mock(NamespaceInfo.class);
when(fakeNSInfo.getBuildVersion()).thenReturn("NSBuildVersion");
@@ -50,10 +53,9 @@
mock(DatanodeProtocolClientSideTranslatorPB.class);
when(fakeDNProt.versionRequest()).thenReturn(fakeNSInfo);
- bpos.setNameNode( fakeDNProt );
- bpos.bpNSInfo = fakeNSInfo;
+ actor.setNameNode( fakeDNProt );
try {
- bpos.retrieveNamespaceInfo();
+ actor.retrieveNamespaceInfo();
fail("register() did not throw exception! " +
"Expected: IncorrectVersionException");
} catch (IncorrectVersionException ie) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDeleteBlockPool.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDeleteBlockPool.java
index 0b0ca7b..2ff075c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDeleteBlockPool.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDeleteBlockPool.java
@@ -31,6 +31,7 @@
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.tools.DFSAdmin;
import org.junit.Test;
@@ -47,8 +48,9 @@
try {
conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES,
"namesServerId1,namesServerId2");
- cluster = new MiniDFSCluster.Builder(conf).federation(true).numNameNodes(
- 2).numDataNodes(2).build();
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
+ .numDataNodes(2).build();
cluster.waitActive();
@@ -155,8 +157,9 @@
try {
conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES,
"namesServerId1,namesServerId2");
- cluster = new MiniDFSCluster.Builder(conf).federation(true).numNameNodes(
- 2).numDataNodes(1).build();
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
+ .numDataNodes(1).build();
cluster.waitActive();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestMulitipleNNDataBlockScanner.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestMulitipleNNDataBlockScanner.java
index 6862628..a21cab5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestMulitipleNNDataBlockScanner.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestMulitipleNNDataBlockScanner.java
@@ -30,6 +30,7 @@
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.junit.Test;
@@ -41,12 +42,13 @@
String bpids[] = new String[3];
FileSystem fs[] = new FileSystem[3];
- public void setUp(int port) throws IOException {
+ public void setUp() throws IOException {
conf = new HdfsConfiguration();
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100);
conf.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 100);
- cluster = new MiniDFSCluster.Builder(conf).numNameNodes(3).nameNodePort(
- port).build();
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(3))
+ .build();
for (int i = 0; i < 3; i++) {
cluster.waitActive(i);
}
@@ -65,7 +67,7 @@
@Test
public void testDataBlockScanner() throws IOException, InterruptedException {
- setUp(9923);
+ setUp();
try {
DataNode dn = cluster.getDataNodes().get(0);
for (int i = 0; i < 3; i++) {
@@ -89,9 +91,10 @@
@Test
public void testBlockScannerAfterRefresh() throws IOException,
InterruptedException {
- setUp(9933);
+ setUp();
try {
- Configuration conf = new HdfsConfiguration(cluster.getConfiguration(0));
+ Configuration dnConf = cluster.getDataNodes().get(0).getConf();
+ Configuration conf = new HdfsConfiguration(dnConf);
StringBuilder namenodesBuilder = new StringBuilder();
String bpidToShutdown = cluster.getNamesystem(2).getBlockPoolId();
@@ -140,7 +143,7 @@
@Test
public void testBlockScannerAfterRestart() throws IOException,
InterruptedException {
- setUp(9943);
+ setUp();
try {
cluster.restartDataNode(0);
cluster.waitActive();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestRefreshNamenodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestRefreshNamenodes.java
index 150f1178..2d6f2103 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestRefreshNamenodes.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestRefreshNamenodes.java
@@ -22,12 +22,18 @@
import java.io.IOException;
import java.net.InetSocketAddress;
+import java.util.Set;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology.NNConf;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology.NSConf;
import org.junit.Test;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
+
/**
* Tests datanode refresh namenode list functionality.
*/
@@ -43,9 +49,13 @@
Configuration conf = new Configuration();
MiniDFSCluster cluster = null;
try {
- conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, "namesServerId1");
- cluster = new MiniDFSCluster.Builder(conf).federation(true).
- numNameNodes(1).nameNodePort(nnPort1).build();
+ MiniDFSNNTopology topology = new MiniDFSNNTopology()
+ .addNameservice(new NSConf("ns1").addNN(
+ new NNConf(null).setIpcPort(nnPort1)))
+ .setFederation(true);
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(topology)
+ .build();
DataNode dn = cluster.getDataNodes().get(0);
assertEquals(1, dn.getAllBpOs().length);
@@ -58,21 +68,24 @@
cluster.addNameNode(conf, nnPort4);
- BPOfferService[] bpoList = dn.getAllBpOs();
// Ensure a BPOfferService in the datanodes corresponds to
// a namenode in the cluster
+ Set<InetSocketAddress> nnAddrsFromCluster = Sets.newHashSet();
for (int i = 0; i < 4; i++) {
- InetSocketAddress addr = cluster.getNameNode(i).getNameNodeAddress();
- boolean found = false;
- for (int j = 0; j < bpoList.length; j++) {
- if (bpoList[j] != null && addr.equals(bpoList[j].nnAddr)) {
- found = true;
- bpoList[j] = null; // Erase the address that matched
- break;
- }
- }
- assertTrue("NameNode address " + addr + " is not found.", found);
+ assertTrue(nnAddrsFromCluster.add(
+ cluster.getNameNode(i).getNameNodeAddress()));
}
+
+ Set<InetSocketAddress> nnAddrsFromDN = Sets.newHashSet();
+ for (BPOfferService bpos : dn.getAllBpOs()) {
+ for (BPServiceActor bpsa : bpos.getBPServiceActors()) {
+ assertTrue(nnAddrsFromDN.add(bpsa.getNNSocketAddress()));
+ }
+ }
+
+ assertEquals("",
+ Joiner.on(",").join(
+ Sets.symmetricDifference(nnAddrsFromCluster, nnAddrsFromDN)));
} finally {
if (cluster != null) {
cluster.shutdown();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/CreateEditsLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/CreateEditsLog.java
index 4d09815..7962d4a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/CreateEditsLog.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/CreateEditsLog.java
@@ -195,7 +195,7 @@
FileNameGenerator nameGenerator = new FileNameGenerator(BASE_PATH, 100);
FSEditLog editLog = FSImageTestUtil.createStandaloneEditLog(editsLogDir);
- editLog.open();
+ editLog.openForWrite();
addFiles(editLog, numFiles, replication, numBlocksPerFile, startingBlockId,
nameGenerator);
editLog.logSync();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java
index b6c69c3..6e9aa8c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/FSImageTestUtil.java
@@ -34,8 +34,11 @@
import java.util.Properties;
import java.util.Set;
+import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
@@ -187,13 +190,36 @@
Mockito.doReturn(sd).when(storage)
.getStorageDirectory(Matchers.<URI>anyObject());
- return new FSEditLog(new Configuration(),
+ FSEditLog editLog = new FSEditLog(new Configuration(),
storage,
ImmutableList.of(logDir.toURI()));
+ editLog.initJournalsForWrite();
+ return editLog;
}
/**
+ * Create an aborted in-progress log in the given directory, containing
+ * only a specified number of "mkdirs" operations.
+ */
+ public static void createAbortedLogWithMkdirs(File editsLogDir, int numDirs,
+ long firstTxId) throws IOException {
+ FSEditLog editLog = FSImageTestUtil.createStandaloneEditLog(editsLogDir);
+ editLog.setNextTxId(firstTxId);
+ editLog.openForWrite();
+
+ PermissionStatus perms = PermissionStatus.createImmutable("fakeuser", "fakegroup",
+ FsPermission.createImmutable((short)0755));
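+    // Log the requested number of mkdir ops, sync, then abort the segment so
+    // it is left in-progress on disk.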
+ for (int i = 1; i <= numDirs; i++) {
+ String dirName = "dir" + i;
+ INodeDirectory dir = new INodeDirectory(dirName, perms);
+ editLog.logMkDir("/" + dirName, dir);
+ }
+ editLog.logSync();
+ editLog.abortCurrentLogSegment();
+ }
+
+ /**
* @param editLog a path of an edit log file
* @return the count of each type of operation in the log file
* @throws Exception if there is an error reading it
@@ -410,13 +436,20 @@
* Assert that the NameNode has checkpoints at the expected
* transaction IDs.
*/
- static void assertNNHasCheckpoints(MiniDFSCluster cluster,
+ public static void assertNNHasCheckpoints(MiniDFSCluster cluster,
List<Integer> txids) {
+ assertNNHasCheckpoints(cluster, 0, txids);
+ }
+
+ public static void assertNNHasCheckpoints(MiniDFSCluster cluster,
+ int nnIdx, List<Integer> txids) {
- for (File nameDir : getNameNodeCurrentDirs(cluster)) {
+ for (File nameDir : getNameNodeCurrentDirs(cluster, nnIdx)) {
LOG.info("examining name dir with files: " +
Joiner.on(",").join(nameDir.listFiles()));
// Should have fsimage_N for the three checkpoints
+ LOG.info("Examining storage dir " + nameDir + " with contents: "
+ + StringUtils.join(nameDir.listFiles(), ", "));
for (long checkpointTxId : txids) {
File image = new File(nameDir,
NNStorage.getImageFileName(checkpointTxId));
@@ -425,9 +458,9 @@
}
}
- public static List<File> getNameNodeCurrentDirs(MiniDFSCluster cluster) {
+ public static List<File> getNameNodeCurrentDirs(MiniDFSCluster cluster, int nnIdx) {
List<File> nameDirs = Lists.newArrayList();
- for (URI u : cluster.getNameDirs(0)) {
+ for (URI u : cluster.getNameDirs(nnIdx)) {
nameDirs.add(new File(u.getPath(), "current"));
}
return nameDirs;
@@ -441,7 +474,7 @@
throws IOException {
File currentDir = sd.getCurrentDir();
List<EditLogFile> foundEditLogs
- = Lists.newArrayList(FileJournalManager.matchEditLogs(currentDir.listFiles()));
+ = Lists.newArrayList(FileJournalManager.matchEditLogs(currentDir));
return Collections.max(foundEditLogs, EditLogFile.COMPARE_BY_START_TXID);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java
index d128167..7f18811 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NNThroughputBenchmark.java
@@ -80,7 +80,7 @@
* <li>-logLevel L specifies the logging level when the benchmark runs.
* The default logging level is {@link Level#ERROR}.</li>
* <li>-UGCacheRefreshCount G will cause the benchmark to call
- * {@link NameNode#refreshUserToGroupsMappings()} after
+ * {@link NameNodeRpcServer#refreshUserToGroupsMappings} after
* every G operations, which purges the name-node's user group cache.
* By default the refresh is never called.</li>
* <li>-keepResults do not clean up the name-space after execution.</li>
@@ -813,7 +813,7 @@
StorageReport[] rep = { new StorageReport(dnRegistration.getStorageID(),
false, DF_CAPACITY, DF_USED, DF_CAPACITY - DF_USED, DF_USED) };
DatanodeCommand[] cmds = nameNodeProto.sendHeartbeat(dnRegistration,
- rep, 0, 0, 0);
+ rep, 0, 0, 0).getCommands();
if(cmds != null) {
for (DatanodeCommand cmd : cmds ) {
if(LOG.isDebugEnabled()) {
@@ -859,7 +859,7 @@
StorageReport[] rep = { new StorageReport(dnRegistration.getStorageID(),
false, DF_CAPACITY, DF_USED, DF_CAPACITY - DF_USED, DF_USED) };
DatanodeCommand[] cmds = nameNodeProto.sendHeartbeat(dnRegistration,
- rep, 0, 0, 0);
+ rep, 0, 0, 0).getCommands();
if (cmds != null) {
for (DatanodeCommand cmd : cmds) {
if (cmd.getAction() == DatanodeProtocol.DNA_TRANSFER) {
@@ -889,8 +889,10 @@
receivedDNReg.setStorageInfo(
new DataStorage(nsInfo, dnInfo.getStorageID()));
receivedDNReg.setInfoPort(dnInfo.getInfoPort());
- ReceivedDeletedBlockInfo[] rdBlocks = { new ReceivedDeletedBlockInfo(
- blocks[i], DataNode.EMPTY_DEL_HINT) };
+ ReceivedDeletedBlockInfo[] rdBlocks = {
+ new ReceivedDeletedBlockInfo(
+ blocks[i], ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK,
+ null) };
StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks(
receivedDNReg.getStorageID(), rdBlocks) };
nameNodeProto.blockReceivedAndDeleted(receivedDNReg, nameNode
@@ -1007,7 +1009,8 @@
int dnIdx = Arrays.binarySearch(datanodes, dnInfo.getName());
datanodes[dnIdx].addBlock(loc.getBlock().getLocalBlock());
ReceivedDeletedBlockInfo[] rdBlocks = { new ReceivedDeletedBlockInfo(
- loc.getBlock().getLocalBlock(), "") };
+ loc.getBlock().getLocalBlock(),
+ ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK, null) };
StorageReceivedDeletedBlocks[] report = { new StorageReceivedDeletedBlocks(
datanodes[dnIdx].dnRegistration.getStorageID(), rdBlocks) };
nameNodeProto.blockReceivedAndDeleted(datanodes[dnIdx].dnRegistration, loc
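The benchmark hunks above track two DatanodeProtocol changes: sendHeartbeat now returns a HeartbeatResponse whose commands are fetched with getCommands(), and ReceivedDeletedBlockInfo takes an explicit BlockStatus plus a (possibly null) delete hint in place of the old empty-string hint. A hedged sketch of the new call shapes; the registration and capacity figures are assumed to come from the enclosing test, as in the hunks above:

import java.io.IOException;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;

// Hypothetical helper class illustrating the post-patch calling pattern.
class HeartbeatSketch {
  static DatanodeCommand[] heartbeat(DatanodeProtocol nn,
      DatanodeRegistration reg, long capacity, long used) throws IOException {
    StorageReport[] rep = { new StorageReport(reg.getStorageID(), false,
        capacity, used, capacity - used, used) };
    // Commands are now unwrapped from the HeartbeatResponse.
    return nn.sendHeartbeat(reg, rep, 0, 0, 0).getCommands();
  }

  static StorageReceivedDeletedBlocks[] receivedReport(
      DatanodeRegistration reg) {
    // Block-received notifications carry an explicit status and a null
    // delete hint rather than an empty string.
    ReceivedDeletedBlockInfo[] rdBlocks = { new ReceivedDeletedBlockInfo(
        new Block(0), ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK,
        null) };
    return new StorageReceivedDeletedBlocks[] {
        new StorageReceivedDeletedBlocks(reg.getStorageID(), rdBlocks) };
  }
}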
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java
index fb1fc6b..fead3b6 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/NameNodeAdapter.java
@@ -17,16 +17,27 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
+import java.io.File;
import java.io.IOException;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import org.apache.hadoop.fs.UnresolvedLinkException;
+import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
+import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
-import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
+import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.MkdirOp;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.SafeModeInfo;
+import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
import org.apache.hadoop.ipc.Server;
+import org.apache.hadoop.ipc.StandbyException;
+import org.apache.hadoop.security.AccessControlException;
+import org.mockito.Mockito;
/**
* This is a utility class to expose NameNode functionality for unit tests.
@@ -48,6 +59,38 @@
src, offset, length, false, true);
}
+ public static HdfsFileStatus getFileInfo(NameNode namenode, String src,
+ boolean resolveLink) throws AccessControlException, UnresolvedLinkException,
+ StandbyException {
+ return namenode.getNamesystem().getFileInfo(src, resolveLink);
+ }
+
+ public static boolean mkdirs(NameNode namenode, String src,
+ PermissionStatus permissions, boolean createParent)
+ throws UnresolvedLinkException, IOException {
+ return namenode.getNamesystem().mkdirs(src, permissions, createParent);
+ }
+
+ public static void saveNamespace(NameNode namenode)
+ throws AccessControlException, IOException {
+ namenode.getNamesystem().saveNamespace();
+ }
+
+ public static void enterSafeMode(NameNode namenode, boolean resourcesLow)
+ throws IOException {
+ namenode.getNamesystem().enterSafeMode(resourcesLow);
+ }
+
+ public static void leaveSafeMode(NameNode namenode, boolean checkForUpgrades)
+ throws SafeModeException {
+ namenode.getNamesystem().leaveSafeMode(checkForUpgrades);
+ }
+
+ public static void abortEditLogs(NameNode nn) {
+ FSEditLog el = nn.getFSImage().getEditLog();
+ el.abortCurrentLogSegment();
+ }
+
/**
* Get the internal RPC server instance.
* @return rpc server
@@ -61,7 +104,7 @@
return ns.getDelegationTokenSecretManager();
}
- public static DatanodeCommand[] sendHeartBeat(DatanodeRegistration nodeReg,
+ public static HeartbeatResponse sendHeartBeat(DatanodeRegistration nodeReg,
DatanodeDescriptor dd, FSNamesystem namesystem) throws IOException {
return namesystem.handleHeartbeat(nodeReg, dd.getCapacity(),
dd.getDfsUsed(), dd.getRemaining(), dd.getBlockPoolUsed(), 0, 0, 0);
@@ -79,7 +122,7 @@
/** Set the softLimit and hardLimit of client lease periods. */
public static void setLeasePeriod(final FSNamesystem namesystem, long soft, long hard) {
getLeaseManager(namesystem).setLeasePeriod(soft, hard);
- namesystem.lmthread.interrupt();
+ namesystem.leaseManager.triggerMonitorCheckNow();
}
public static String getLeaseHolderForPath(NameNode namenode, String path) {
@@ -87,6 +130,19 @@
}
/**
+ * @return the timestamp of the last renewal of the given lease,
+ * or -1 in the case that the lease doesn't exist.
+ */
+ public static long getLeaseRenewalTime(NameNode nn, String path) {
+ LeaseManager lm = nn.getNamesystem().leaseManager;
+ Lease l = lm.getLeaseByPath(path);
+ if (l == null) {
+ return -1;
+ }
+ return l.getLastUpdate();
+ }
+
+ /**
* Return the datanode descriptor for the given datanode.
*/
public static DatanodeDescriptor getDatanode(final FSNamesystem ns,
@@ -100,6 +156,33 @@
}
/**
+ * Return the FSNamesystem stats
+ */
+ public static long[] getStats(final FSNamesystem fsn) {
+ return fsn.getStats();
+ }
+
+ public static ReentrantReadWriteLock spyOnFsLock(FSNamesystem fsn) {
+ ReentrantReadWriteLock spy = Mockito.spy(fsn.getFsLockForTests());
+ fsn.setFsLockForTests(spy);
+ return spy;
+ }
+
+ public static FSImage spyOnFsImage(NameNode nn1) {
+ FSImage spy = Mockito.spy(nn1.getNamesystem().dir.fsImage);
+ nn1.getNamesystem().dir.fsImage = spy;
+ return spy;
+ }
+
+ public static String getMkdirOpPath(FSEditLogOp op) {
+ if (op.opCode == FSEditLogOpCodes.OP_MKDIR) {
+ return ((MkdirOp) op).path;
+ } else {
+ return null;
+ }
+ }
+
+ /**
* @return the number of blocks marked safe by safemode, or -1
* if safemode is not running.
*/
@@ -122,4 +205,8 @@
}
return smi.initializedReplQueues;
}
+
+ public static File getInProgressEditsFile(StorageDirectory sd, long startTxId) {
+ return NNStorage.getInProgressEditsFile(sd, startTxId);
+ }
}
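NameNodeAdapter grows several hooks that the HA tests lean on: mkdirs/saveNamespace/safe-mode entry points, Mockito spies over the FSNamesystem lock and the FSImage, and lease inspection. A hedged sketch of two of them; the MiniDFSCluster setup is the usual pattern and is assumed, not part of the patch:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.namenode.FSImage;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;

// Hypothetical sketch of the new test hooks.
class NameNodeAdapterSketch {
  static void sketch() throws Exception {
    MiniDFSCluster cluster =
        new MiniDFSCluster.Builder(new Configuration()).build();
    try {
      NameNode nn = cluster.getNameNode();
      // Swap a Mockito spy in front of the live FSImage; subsequent
      // FSImage calls can then be verified or stubbed by the test.
      FSImage imageSpy = NameNodeAdapter.spyOnFsImage(nn);
      // Returns -1 because no lease exists for this path yet.
      long lastRenewal =
          NameNodeAdapter.getLeaseRenewalTime(nn, "/no-such-file");
    } finally {
      cluster.shutdown();
    }
  }
}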
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java
index e22fa29..392cc9d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/OfflineEditsViewerHelper.java
@@ -108,10 +108,11 @@
// for security to work (fake JobTracker user)
config.set("hadoop.security.auth_to_local",
"RULE:[2:$1@$0](JobTracker@.*FOO.COM)s/@.*//" + "DEFAULT");
+ config.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
cluster =
new MiniDFSCluster.Builder(config).manageNameDfsDirs(false).build();
cluster.waitClusterUp();
- cluster.getNamesystem().getDelegationTokenSecretManager().startThreads();
}
/**
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestBackupNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestBackupNode.java
index 2d8a115..5d93b8c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestBackupNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestBackupNode.java
@@ -33,6 +33,7 @@
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
@@ -123,6 +124,7 @@
@Test
public void testBackupNodeTailsEdits() throws Exception {
Configuration conf = new HdfsConfiguration();
+ HAUtil.setAllowStandbyReads(conf, true);
MiniDFSCluster cluster = null;
FileSystem fileSys = null;
BackupNode backup = null;
@@ -244,11 +246,12 @@
}
void testCheckpoint(StartupOption op) throws Exception {
- Path file1 = new Path("checkpoint.dat");
- Path file2 = new Path("checkpoint2.dat");
- Path file3 = new Path("backup.dat");
+ Path file1 = new Path("/checkpoint.dat");
+ Path file2 = new Path("/checkpoint2.dat");
+ Path file3 = new Path("/backup.dat");
Configuration conf = new HdfsConfiguration();
+ HAUtil.setAllowStandbyReads(conf, true);
short replication = (short)conf.getInt("dfs.replication", 3);
int numDatanodes = Math.max(3, replication);
conf.set(DFSConfigKeys.DFS_BLOCKREPORT_INITIAL_DELAY_KEY, "0");
@@ -345,11 +348,13 @@
TestCheckpoint.checkFile(fileSys, file3, replication);
// should also be on BN right away
assertTrue("file3 does not exist on BackupNode",
- op != StartupOption.BACKUP || bnFS.exists(file3));
+ op != StartupOption.BACKUP ||
+ backup.getNamesystem().getFileInfo(
+ file3.toUri().getPath(), false) != null);
} catch(IOException e) {
LOG.error("Error in TestBackupNode:", e);
- assertTrue(e.getLocalizedMessage(), false);
+ throw new AssertionError(e);
} finally {
if(backup != null) backup.stop();
if(fileSys != null) fileSys.close();
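The two configuration tweaks above are what let the BackupNode answer metadata queries directly: HAUtil.setAllowStandbyReads permits read operations on a node that is not active, and the assertion now goes through the BackupNode's own namesystem rather than its FileSystem view. A minimal hedged sketch of the configuration side:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;

// Hypothetical helper; only the conf wiring shown in the hunks above.
class StandbyReadsSketch {
  static Configuration confAllowingStandbyReads() {
    Configuration conf = new HdfsConfiguration();
    // Without this, read calls such as getFileInfo() against a
    // non-active node (here, the BackupNode) would be refused.
    HAUtil.setAllowStandbyReads(conf, true);
    return conf;
  }
}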
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckPointForSecurityTokens.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckPointForSecurityTokens.java
index fbbcfc7..20d4c72 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckPointForSecurityTokens.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckPointForSecurityTokens.java
@@ -22,6 +22,7 @@
import java.io.*;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
@@ -57,18 +58,19 @@
}
/**
- * Tests save namepsace.
+ * Tests save namespace.
*/
@Test
public void testSaveNamespace() throws IOException {
DistributedFileSystem fs = null;
try {
Configuration conf = new HdfsConfiguration();
+ conf.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes).build();
cluster.waitActive();
fs = (DistributedFileSystem)(cluster.getFileSystem());
FSNamesystem namesystem = cluster.getNamesystem();
- namesystem.getDelegationTokenSecretManager().startThreads();
String renewer = UserGroupInformation.getLoginUser().getUserName();
Token<DelegationTokenIdentifier> token1 = namesystem
.getDelegationToken(new Text(renewer));
@@ -122,7 +124,6 @@
}
namesystem = cluster.getNamesystem();
- namesystem.getDelegationTokenSecretManager().startThreads();
Token<DelegationTokenIdentifier> token3 = namesystem
.getDelegationToken(new Text(renewer));
Token<DelegationTokenIdentifier> token4 = namesystem
@@ -136,7 +137,6 @@
cluster.waitActive();
namesystem = cluster.getNamesystem();
- namesystem.getDelegationTokenSecretManager().startThreads();
Token<DelegationTokenIdentifier> token5 = namesystem
.getDelegationToken(new Text(renewer));
@@ -159,7 +159,6 @@
cluster.waitActive();
namesystem = cluster.getNamesystem();
- namesystem.getDelegationTokenSecretManager().startThreads();
try {
renewToken(token1);
cancelToken(token1);
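The pattern across this file is the same throughout the patch: instead of manually starting the delegation token secret manager threads after every restart, the test sets DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY once and lets the NameNode start them itself. A hedged sketch of the new setup (the token request only demonstrates that no startThreads() call is needed any more):

package org.apache.hadoop.hdfs.server.namenode;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.UserGroupInformation;

// Hypothetical sketch; in the namenode package like the test above.
class DelegationTokenSetupSketch {
  static void sketch() throws Exception {
    Configuration conf = new HdfsConfiguration();
    conf.setBoolean(
        DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
    try {
      cluster.waitActive();
      String renewer = UserGroupInformation.getLoginUser().getUserName();
      // Tokens can be issued immediately after startup.
      cluster.getNamesystem().getDelegationToken(new Text(renewer));
    } finally {
      cluster.shutdown();
    }
  }
}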
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
index 19f4812..daed09b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
@@ -46,6 +46,7 @@
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
@@ -655,6 +656,7 @@
sdToLock.lock();
try {
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .format(false)
.manageNameDfsDirs(false)
.numDataNodes(0)
.build();
@@ -861,7 +863,7 @@
}
/**
- * Tests save namepsace.
+ * Tests save namespace.
*/
public void testSaveNamespace() throws IOException {
MiniDFSCluster cluster = null;
@@ -911,10 +913,12 @@
throw new IOException(e);
}
+ final int EXPECTED_TXNS_FIRST_SEG = 12;
+
// the following steps should have happened:
- // edits_inprogress_1 -> edits_1-8 (finalized)
- // fsimage_8 created
- // edits_inprogress_9 created
+ // edits_inprogress_1 -> edits_1-12 (finalized)
+ // fsimage_12 created
+ // edits_inprogress_13 created
//
for(URI uri : editsDirs) {
File ed = new File(uri.getPath());
@@ -926,19 +930,21 @@
NNStorage.getInProgressEditsFileName(1));
assertFalse(originalEdits.exists());
File finalizedEdits = new File(curDir,
- NNStorage.getFinalizedEditsFileName(1,8));
- assertTrue(finalizedEdits.exists());
+ NNStorage.getFinalizedEditsFileName(1, EXPECTED_TXNS_FIRST_SEG));
+ GenericTestUtils.assertExists(finalizedEdits);
assertTrue(finalizedEdits.length() > Integer.SIZE/Byte.SIZE);
- assertTrue(new File(ed, "current/"
- + NNStorage.getInProgressEditsFileName(9)).exists());
+ GenericTestUtils.assertExists(new File(ed, "current/"
+ + NNStorage.getInProgressEditsFileName(
+ EXPECTED_TXNS_FIRST_SEG + 1)));
}
Collection<URI> imageDirs = cluster.getNameDirs(0);
for (URI uri : imageDirs) {
File imageDir = new File(uri.getPath());
File savedImage = new File(imageDir, "current/"
- + NNStorage.getImageFileName(8));
+ + NNStorage.getImageFileName(
+ EXPECTED_TXNS_FIRST_SEG));
assertTrue("Should have saved image at " + savedImage,
savedImage.exists());
}
@@ -1059,8 +1065,9 @@
String nameserviceId2 = "ns2";
conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, nameserviceId1
+ "," + nameserviceId2);
- MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numNameNodes(2)
- .nameNodePort(9928).build();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
+ .build();
Configuration snConf1 = new HdfsConfiguration(cluster.getConfiguration(0));
Configuration snConf2 = new HdfsConfiguration(cluster.getConfiguration(1));
InetSocketAddress nn1RpcAddress =
@@ -1076,9 +1083,9 @@
snConf2.set(DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, "");
// Set the nameserviceIds
- snConf1.set(DFSUtil.getNameServiceIdKey(
+ snConf1.set(DFSUtil.addKeySuffixes(
DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, nameserviceId1), nn1);
- snConf2.set(DFSUtil.getNameServiceIdKey(
+ snConf2.set(DFSUtil.addKeySuffixes(
DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, nameserviceId2), nn2);
SecondaryNameNode secondary1 = startSecondaryNameNode(snConf1);
@@ -1317,17 +1324,11 @@
// Let the first one finish
delayer.proceed();
- // Letting the first node continue should catch an exception
+ // Letting the first node continue, it should try to upload the
+ // same image, and gracefully ignore it, while logging an
+ // error message.
checkpointThread.join();
- try {
- checkpointThread.propagateExceptions();
- fail("Didn't throw!");
- } catch (Exception ioe) {
- assertTrue("Unexpected exception: " +
- StringUtils.stringifyException(ioe),
- ioe.toString().contains("Another checkpointer already uploaded"));
- LOG.info("Caught expected exception", ioe);
- }
+ checkpointThread.propagateExceptions();
// primary should still consider fsimage_4 the latest
assertEquals(4, storage.getMostRecentCheckpointTxId());
@@ -1763,7 +1764,7 @@
private void assertParallelFilesInvariant(MiniDFSCluster cluster,
ImmutableList<SecondaryNameNode> secondaries) throws Exception {
List<File> allCurrentDirs = Lists.newArrayList();
- allCurrentDirs.addAll(getNameNodeCurrentDirs(cluster));
+ allCurrentDirs.addAll(getNameNodeCurrentDirs(cluster, 0));
for (SecondaryNameNode snn : secondaries) {
allCurrentDirs.addAll(getCheckpointCurrentDirs(snn));
}
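Two API moves show up in this file: multi-NameNode clusters are now described with MiniDFSNNTopology instead of numNameNodes()/nameNodePort(), and per-nameservice configuration keys are built with DFSUtil.addKeySuffixes. A hedged sketch of both (the nameservice id passed to the key helper is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology;

// Hypothetical sketch of the new federated-cluster setup.
class FederatedTopologySketch {
  static MiniDFSCluster startTwoNameservices() throws Exception {
    Configuration conf = new HdfsConfiguration();
    return new MiniDFSCluster.Builder(conf)
        .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(2))
        .build();
  }

  static String serviceRpcKeyFor(String nameserviceId) {
    // e.g. dfs.namenode.servicerpc-address.<nameserviceId>
    return DFSUtil.addKeySuffixes(
        DFSConfigKeys.DFS_NAMENODE_SERVICE_RPC_ADDRESS_KEY, nameserviceId);
  }
}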
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestClusterId.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestClusterId.java
index 68dc9f5..98c17a7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestClusterId.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestClusterId.java
@@ -26,6 +26,7 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
+import java.util.List;
import java.util.Properties;
import org.apache.commons.logging.Log;
@@ -47,7 +48,7 @@
private String getClusterId(Configuration config) throws IOException {
// see if cluster id not empty.
Collection<URI> dirsToFormat = FSNamesystem.getNamespaceDirs(config);
- Collection<URI> editsToFormat = FSNamesystem.getNamespaceEditsDirs(config);
+ List<URI> editsToFormat = FSNamesystem.getNamespaceEditsDirs(config);
FSImage fsImage = new FSImage(config, dirsToFormat, editsToFormat);
Iterator<StorageDirectory> sdit =
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java
index 301c4d4..82730ea 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestDeadDatanode.java
@@ -110,7 +110,9 @@
DatanodeProtocol dnp = cluster.getNameNodeRpc();
ReceivedDeletedBlockInfo[] blocks = { new ReceivedDeletedBlockInfo(
- new Block(0), "") };
+ new Block(0),
+ ReceivedDeletedBlockInfo.BlockStatus.RECEIVED_BLOCK,
+ null) };
StorageReceivedDeletedBlocks[] storageBlocks = {
new StorageReceivedDeletedBlocks(reg.getStorageID(), blocks) };
@@ -136,7 +138,7 @@
// that asks datanode to register again
StorageReport[] rep = { new StorageReport(reg.getStorageID(), false, 0, 0,
0, 0) };
- DatanodeCommand[] cmd = dnp.sendHeartbeat(reg, rep, 0, 0, 0);
+ DatanodeCommand[] cmd = dnp.sendHeartbeat(reg, rep, 0, 0, 0).getCommands();
Assert.assertEquals(1, cmd.length);
Assert.assertEquals(cmd[0].getAction(), RegisterCommand.REGISTER
.getAction());
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java
index b3eeeab..bc41e7b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java
@@ -147,7 +147,7 @@
public void testPreTxIdEditLogNoEdits() throws Exception {
FSNamesystem namesys = Mockito.mock(FSNamesystem.class);
namesys.dir = Mockito.mock(FSDirectory.class);
- int numEdits = testLoad(
+ long numEdits = testLoad(
StringUtils.hexStringToByte("ffffffed"), // just version number
namesys);
assertEquals(0, numEdits);
@@ -166,7 +166,7 @@
cluster.waitActive();
final FSNamesystem namesystem = cluster.getNamesystem();
- int numEdits = testLoad(HADOOP20_SOME_EDITS, namesystem);
+ long numEdits = testLoad(HADOOP20_SOME_EDITS, namesystem);
assertEquals(3, numEdits);
// Sanity check the edit
HdfsFileStatus fileInfo = namesystem.getFileInfo("/myfile", false);
@@ -177,7 +177,7 @@
}
}
- private int testLoad(byte[] data, FSNamesystem namesys) throws IOException {
+ private long testLoad(byte[] data, FSNamesystem namesys) throws IOException {
FSEditLogLoader loader = new FSEditLogLoader(namesys);
return loader.loadFSEdits(new EditLogByteInputStream(data), 1);
}
@@ -321,7 +321,7 @@
assertTrue("Expect " + editFile + " exists", editFile.exists());
System.out.println("Verifying file: " + editFile);
- int numEdits = loader.loadFSEdits(
+ long numEdits = loader.loadFSEdits(
new EditLogFileInputStream(editFile), 3);
int numLeases = namesystem.leaseManager.countLease();
System.out.println("Number of outstanding leases " + numLeases);
@@ -589,7 +589,6 @@
currentDir.getAbsolutePath());
assertNotNull("No image found in " + nameDir, imageFile);
assertEquals(NNStorage.getImageFileName(0), imageFile.getName());
-
// Try to start a new cluster
LOG.info("\n===========================================\n" +
"Starting same cluster after simulated crash");
@@ -636,22 +635,26 @@
}
}
+ // should succeed - only one corrupt log dir
public void testCrashRecoveryEmptyLogOneDir() throws Exception {
- doTestCrashRecoveryEmptyLog(false, true);
+ doTestCrashRecoveryEmptyLog(false, true, true);
}
+ // should fail - seen_txid updated to 3, but no log dir contains txid 3
public void testCrashRecoveryEmptyLogBothDirs() throws Exception {
- doTestCrashRecoveryEmptyLog(true, true);
+ doTestCrashRecoveryEmptyLog(true, true, false);
}
+ // should succeed - only one corrupt log dir
public void testCrashRecoveryEmptyLogOneDirNoUpdateSeenTxId()
throws Exception {
- doTestCrashRecoveryEmptyLog(false, false);
+ doTestCrashRecoveryEmptyLog(false, false, true);
}
+ // should succeed - both log dirs corrupt, but seen_txid never updated
public void testCrashRecoveryEmptyLogBothDirsNoUpdateSeenTxId()
throws Exception {
- doTestCrashRecoveryEmptyLog(true, false);
+ doTestCrashRecoveryEmptyLog(true, false, true);
}
/**
@@ -667,12 +670,13 @@
* NN should fail to start up, because it's aware that txid 3
* was reached, but unable to find a non-corrupt log starting there.
* @param updateTransactionIdFile if true update the seen_txid file.
- * If false, the it will not be updated. This will simulate a case
- * where the NN crashed between creating the new segment and updating
- * seen_txid.
+ * If false, it will not be updated. This will simulate a case where
+ * the NN crashed between creating the new segment and updating the
+ * seen_txid file.
+ * @param shouldSucceed true if the test is expected to succeed.
*/
private void doTestCrashRecoveryEmptyLog(boolean inBothDirs,
- boolean updateTransactionIdFile)
+ boolean updateTransactionIdFile, boolean shouldSucceed)
throws Exception {
// start a cluster
Configuration conf = new HdfsConfiguration();
@@ -691,29 +695,40 @@
// Make a truncated edits_3_inprogress
File log = new File(currentDir,
NNStorage.getInProgressEditsFileName(3));
- NNStorage storage = new NNStorage(conf,
- Collections.<URI>emptyList(),
- Lists.newArrayList(uri));
- if (updateTransactionIdFile) {
- storage.writeTransactionIdFileToStorage(3);
- }
- storage.close();
new EditLogFileOutputStream(log, 1024).create();
if (!inBothDirs) {
break;
}
+
+ NNStorage storage = new NNStorage(conf,
+ Collections.<URI>emptyList(),
+ Lists.newArrayList(uri));
+
+ if (updateTransactionIdFile) {
+ storage.writeTransactionIdFileToStorage(3);
+ }
+ storage.close();
}
try {
cluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(NUM_DATA_NODES).format(false).build();
- fail("Did not fail to start with all-corrupt logs");
+ if (!shouldSucceed) {
+ fail("Should not have succeeded in startin cluster");
+ }
} catch (IOException ioe) {
- GenericTestUtils.assertExceptionContains(
- "No non-corrupt logs for txid 3", ioe);
+ if (shouldSucceed) {
+ LOG.info("Should have succeeded in starting cluster, but failed", ioe);
+ throw ioe;
+ } else {
+ GenericTestUtils.assertExceptionContains(
+ "No non-corrupt logs for txid 3",
+ ioe);
+ }
+ } finally {
+ cluster.shutdown();
}
- cluster.shutdown();
}
@@ -781,6 +796,11 @@
public JournalType getType() {
return JournalType.FILE;
}
+
+ @Override
+ public boolean isInProgress() {
+ return true;
+ }
}
public void testFailedOpen() throws Exception {
@@ -789,11 +809,11 @@
FSEditLog log = FSImageTestUtil.createStandaloneEditLog(logDir);
try {
logDir.setWritable(false);
- log.open();
+ log.openForWrite();
fail("Did no throw exception on only having a bad dir");
} catch (IOException ioe) {
GenericTestUtils.assertExceptionContains(
- "no journals successfully started", ioe);
+ "too few journals successfully started", ioe);
} finally {
logDir.setWritable(true);
log.close();
@@ -813,7 +833,7 @@
new byte[500]);
try {
- log.open();
+ log.openForWrite();
NameNodeMetrics mockMetrics = Mockito.mock(NameNodeMetrics.class);
log.setMetricsForTests(mockMetrics);
@@ -848,6 +868,7 @@
"[1,100]|[101,200]|[201,]",
"[1,100]|[101,200]|[201,]");
log = new FSEditLog(storage);
+ log.initJournalsForWrite();
assertEquals("[[1,100], [101,200]]",
log.getEditLogManifest(1).toString());
assertEquals("[[101,200]]",
@@ -859,6 +880,7 @@
"[1,100]|[101,200]",
"[1,100]|[201,300]|[301,400]"); // nothing starting at 101
log = new FSEditLog(storage);
+ log.initJournalsForWrite();
assertEquals("[[1,100], [101,200], [201,300], [301,400]]",
log.getEditLogManifest(1).toString());
@@ -868,6 +890,7 @@
"[1,100]|[301,400]", // gap from 101 to 300
"[301,400]|[401,500]");
log = new FSEditLog(storage);
+ log.initJournalsForWrite();
assertEquals("[[301,400], [401,500]]",
log.getEditLogManifest(1).toString());
@@ -877,6 +900,7 @@
"[1,100]|[101,150]", // short log at 101
"[1,50]|[101,200]"); // short log at 1
log = new FSEditLog(storage);
+ log.initJournalsForWrite();
assertEquals("[[1,100], [101,200]]",
log.getEditLogManifest(1).toString());
assertEquals("[[101,200]]",
@@ -889,6 +913,7 @@
"[1,100]|[101,]",
"[1,100]|[101,200]");
log = new FSEditLog(storage);
+ log.initJournalsForWrite();
assertEquals("[[1,100], [101,200]]",
log.getEditLogManifest(1).toString());
assertEquals("[[101,200]]",
@@ -967,11 +992,11 @@
*
* @param editUris directories to create edit logs in
* @param numrolls number of times to roll the edit log during setup
+ * @param closeOnFinish whether to close the edit log after setup
* @param abortAtRolls Specifications for when to fail, see AbortSpec
*/
- public static NNStorage setupEdits(List<URI> editUris, int numrolls,
- AbortSpec... abortAtRolls)
- throws IOException {
+ public static NNStorage setupEdits(List<URI> editUris, int numrolls,
+ boolean closeOnFinish, AbortSpec... abortAtRolls) throws IOException {
List<AbortSpec> aborts = new ArrayList<AbortSpec>(Arrays.asList(abortAtRolls));
NNStorage storage = new NNStorage(new Configuration(),
Collections.<URI>emptyList(),
@@ -981,7 +1006,8 @@
// open the edit log and add two transactions
// logGenerationStamp is used, simply because it doesn't
// require complex arguments.
- editlog.open();
+ editlog.initJournalsForWrite();
+ editlog.openForWrite();
for (int i = 2; i < TXNS_PER_ROLL; i++) {
editlog.logGenerationStamp((long)0);
}
@@ -1009,16 +1035,34 @@
}
editlog.logSync();
}
- editlog.close();
+
+ if (closeOnFinish) {
+ editlog.close();
+ }
FSImageTestUtil.logStorageContents(LOG, storage);
return storage;
}
+
+ /**
+ * Set up directories for tests.
+ *
+ * Each rolled file is 10 txns long.
+ * A failed file is 2 txns long.
+ *
+ * @param editUris directories to create edit logs in
+ * @param numrolls number of times to roll the edit log during setup
+ * @param abortAtRolls Specifications for when to fail, see AbortSpec
+ */
+ public static NNStorage setupEdits(List<URI> editUris, int numrolls,
+ AbortSpec... abortAtRolls) throws IOException {
+ return setupEdits(editUris, numrolls, true, abortAtRolls);
+ }
/**
* Test loading an editlog which has had both its storage fail
* on alternating rolls. Two edit log directories are created.
- * The first on fails on odd rolls, the second on even. Test
+ * The first one fails on odd rolls, the second on even. Test
* that we are able to load the entire editlog regardless.
*/
@Test
@@ -1041,6 +1085,7 @@
new AbortSpec(10, 1));
long totaltxnread = 0;
FSEditLog editlog = new FSEditLog(storage);
+ editlog.initJournalsForWrite();
long startTxId = 1;
Iterable<EditLogInputStream> editStreams = editlog.selectInputStreams(startTxId,
TXNS_PER_ROLL*11);
@@ -1090,11 +1135,10 @@
assertTrue(files[0].delete());
FSEditLog editlog = new FSEditLog(storage);
+ editlog.initJournalsForWrite();
long startTxId = 1;
try {
- Iterable<EditLogInputStream> editStreams
- = editlog.selectInputStreams(startTxId, 4*TXNS_PER_ROLL);
-
+ editlog.selectInputStreams(startTxId, 4*TXNS_PER_ROLL);
fail("Should have thrown exception");
} catch (IOException ioe) {
GenericTestUtils.assertExceptionContains(
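A recurring change in this file is the split of the old FSEditLog.open() into two steps: initJournalsForWrite() prepares the journal set and openForWrite() starts the first segment; setupEdits also gains a closeOnFinish flag so a test can leave an in-progress segment behind. A hedged sketch of the new startup sequence (the scratch path is illustrative):

package org.apache.hadoop.hdfs.server.namenode;

import java.io.File;

// Hypothetical sketch; placed in the namenode package, like the tests, so
// the package-visible FSEditLog methods are reachable.
class EditLogOpenSketch {
  static void sketch() throws Exception {
    File logDir = new File(System.getProperty("java.io.tmpdir"),
        "editlog-open-sketch");                      // illustrative path
    // createStandaloneEditLog() now calls initJournalsForWrite() itself.
    FSEditLog log = FSImageTestUtil.createStandaloneEditLog(logDir);
    log.openForWrite();              // replaces the old open()
    log.logGenerationStamp(0L);      // any trivial transaction will do
    log.logSync();
    log.close();
  }
}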
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogJournalFailures.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogJournalFailures.java
index 77fd686..d14b2b2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogJournalFailures.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogJournalFailures.java
@@ -42,6 +42,7 @@
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
+import org.mockito.Mockito;
import org.mockito.verification.VerificationMode;
public class TestEditLogJournalFailures {
@@ -144,21 +145,35 @@
DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY);
shutDownMiniCluster();
Configuration conf = new HdfsConfiguration();
- conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY, editsDirs[1]);
+ conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY, editsDirs[0]);
conf.setInt(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY, 0);
conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_KEY, 0);
setUpMiniCluster(conf, true);
assertTrue(doAnEdit());
// Invalidated the one required edits journal.
- invalidateEditsDirAtIndex(1, false, false);
+ invalidateEditsDirAtIndex(0, false, false);
+ JournalAndStream nonRequiredJas = getJournalAndStream(1);
+ EditLogFileOutputStream nonRequiredSpy =
+ spyOnStream(nonRequiredJas);
+
// Make sure runtime.exit(...) hasn't been called at all yet.
assertExitInvocations(0);
+ // ..and that the other stream is active.
+ assertTrue(nonRequiredJas.isActive());
+
// This will actually return true in the tests, since the NN will not in
// fact call Runtime.exit();
doAnEdit();
+ // Since the required directory failed setReadyToFlush, and that
+ // directory was listed prior to the non-required directory,
+ // we should not call setReadyToFlush on the non-required
+ // directory. Regression test for HDFS-2874.
+ Mockito.verify(nonRequiredSpy, Mockito.never()).setReadyToFlush();
+ assertFalse(nonRequiredJas.isActive());
+
// A single failure of a required journal should result in a call to
// runtime.exit(...).
assertExitInvocations(atLeast(1));
@@ -217,15 +232,10 @@
* @param index the index of the journal to take offline.
* @return the original <code>EditLogOutputStream</code> of the journal.
*/
- private EditLogOutputStream invalidateEditsDirAtIndex(int index,
+ private void invalidateEditsDirAtIndex(int index,
boolean failOnFlush, boolean failOnWrite) throws IOException {
- FSImage fsimage = cluster.getNamesystem().getFSImage();
- FSEditLog editLog = fsimage.getEditLog();
-
- JournalAndStream jas = editLog.getJournals().get(index);
- EditLogFileOutputStream elos =
- (EditLogFileOutputStream) jas.getCurrentStream();
- EditLogFileOutputStream spyElos = spy(elos);
+ JournalAndStream jas = getJournalAndStream(index);
+ EditLogFileOutputStream spyElos = spyOnStream(jas);
if (failOnWrite) {
doThrow(new IOException("fail on write()")).when(spyElos).write(
(FSEditLogOp) any());
@@ -237,25 +247,24 @@
.setReadyToFlush();
}
doNothing().when(spyElos).abort();
-
+ }
+
+ private EditLogFileOutputStream spyOnStream(JournalAndStream jas) {
+ EditLogFileOutputStream elos =
+ (EditLogFileOutputStream) jas.getCurrentStream();
+ EditLogFileOutputStream spyElos = spy(elos);
jas.setCurrentStreamForTests(spyElos);
-
- return elos;
+ return spyElos;
}
/**
- * Restore the journal at index <code>index</code> with the passed
- * {@link EditLogOutputStream}.
- *
- * @param index index of the journal to restore.
- * @param elos the {@link EditLogOutputStream} to put at that index.
+ * Pull out one of the JournalAndStream objects from the edit log.
*/
- private void restoreEditsDirAtIndex(int index, EditLogOutputStream elos) {
+ private JournalAndStream getJournalAndStream(int index) {
FSImage fsimage = cluster.getNamesystem().getFSImage();
FSEditLog editLog = fsimage.getEditLog();
- JournalAndStream jas = editLog.getJournals().get(index);
- jas.setCurrentStreamForTests(elos);
+ return editLog.getJournals().get(index);
}
/**
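The required-journal scenario above is driven entirely by configuration: one edits directory is named in the "required" key, and the minimum-journal and checked-volume thresholds are relaxed so the failure path, rather than startup validation, is what trips. A hedged sketch of that wiring:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;

// Hypothetical helper mirroring the conf used by the required-journal test.
class RequiredEditsDirSketch {
  static Configuration confWithRequiredEditsDir(String requiredDirUri) {
    Configuration conf = new HdfsConfiguration();
    // Mark exactly one edits directory as required...
    conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY,
        requiredDirUri);
    // ...and relax the global minimums so only the "required" semantics
    // are exercised.
    conf.setInt(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY, 0);
    conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKED_VOLUMES_MINIMUM_KEY, 0);
    return conf;
  }
}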
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogRace.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogRace.java
index d3d6459..da66b45 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogRace.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogRace.java
@@ -237,7 +237,7 @@
System.out.println("Verifying file: " + editFile);
FSEditLogLoader loader = new FSEditLogLoader(namesystem);
- int numEditsThisLog = loader.loadFSEdits(new EditLogFileInputStream(editFile),
+ long numEditsThisLog = loader.loadFSEdits(new EditLogFileInputStream(editFile),
startTxId);
System.out.println("Number of edits: " + numEditsThisLog);
@@ -375,6 +375,7 @@
true);
LOG.info("mkdirs complete");
} catch (Throwable ioe) {
+ LOG.fatal("Got exception", ioe);
deferredException.set(ioe);
waitToEnterFlush.countDown();
}
@@ -469,6 +470,7 @@
true);
LOG.info("mkdirs complete");
} catch (Throwable ioe) {
+ LOG.fatal("Got exception", ioe);
deferredException.set(ioe);
waitToEnterSync.countDown();
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSEditLogLoader.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSEditLogLoader.java
index dcbeea6..fd1733a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSEditLogLoader.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSEditLogLoader.java
@@ -92,7 +92,7 @@
StringBuilder bld = new StringBuilder();
bld.append("^Error replaying edit log at offset \\d+");
- bld.append("On transaction ID \\d+\n");
+ bld.append(" on transaction ID \\d+\n");
bld.append("Recent opcode offsets: (\\d+\\s*){4}$");
try {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(NUM_DATA_NODES)
@@ -167,7 +167,7 @@
SortedMap<Long, Long> offsetToTxId = Maps.newTreeMap();
try {
fsel = FSImageTestUtil.createStandaloneEditLog(testDir);
- fsel.open();
+ fsel.openForWrite();
assertTrue("should exist: " + logFile, logFile.exists());
for (int i = 0; i < NUM_TXNS; i++) {
@@ -245,7 +245,9 @@
Files.copy(logFileBak, logFile);
corruptByteInFile(logFile, offset);
EditLogValidation val = EditLogFileInputStream.validateEditLog(logFile);
- assertTrue(val.getNumTransactions() >= prevNumValid);
+ assertTrue(String.format("%d should have been >= %d",
+ val.getNumTransactions(), prevNumValid),
+ val.getNumTransactions() >= prevNumValid);
prevNumValid = val.getNumTransactions();
}
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java
new file mode 100644
index 0000000..de3a89c
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystem.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs.server.namenode;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+public class TestFSNamesystem {
+
+ /**
+ * Tests that the namenode edits dirs are returned with duplicates removed
+ */
+ @Test
+ public void testUniqueEditDirs() throws IOException {
+ Configuration config = new Configuration();
+
+ config.set(DFS_NAMENODE_EDITS_DIR_KEY, "file://edits/dir, "
+ + "file://edits/dir1,file://edits/dir1"); // overlapping internally
+
+ // getNamespaceEditsDirs removes duplicates
+ Collection<URI> editsDirs = FSNamesystem.getNamespaceEditsDirs(config);
+ assertEquals(2, editsDirs.size());
+ }
+
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileJournalManager.java
index e4ff4bb..0ac1944 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileJournalManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFileJournalManager.java
@@ -29,7 +29,9 @@
import java.io.FilenameFilter;
import java.io.IOException;
import org.junit.Test;
+import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.server.namenode.JournalManager.CorruptionException;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.test.GenericTestUtils;
import static org.apache.hadoop.hdfs.server.namenode.TestEditLog.setupEdits;
@@ -58,8 +60,8 @@
long numJournals = 0;
for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.EDITS)) {
- FileJournalManager jm = new FileJournalManager(sd);
- assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1));
+ FileJournalManager jm = new FileJournalManager(sd, storage);
+ assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1, true));
numJournals++;
}
assertEquals(3, numJournals);
@@ -78,9 +80,9 @@
5, new AbortSpec(5, 0));
StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next();
- FileJournalManager jm = new FileJournalManager(sd);
+ FileJournalManager jm = new FileJournalManager(sd, storage);
assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL,
- jm.getNumberOfTransactions(1));
+ jm.getNumberOfTransactions(1, true));
}
/**
@@ -101,16 +103,17 @@
5, new AbortSpec(5, 1));
Iterator<StorageDirectory> dirs = storage.dirIterator(NameNodeDirType.EDITS);
StorageDirectory sd = dirs.next();
- FileJournalManager jm = new FileJournalManager(sd);
- assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1));
+ FileJournalManager jm = new FileJournalManager(sd, storage);
+ assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1, true));
sd = dirs.next();
- jm = new FileJournalManager(sd);
- assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1));
+ jm = new FileJournalManager(sd, storage);
+ assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1,
+ true));
sd = dirs.next();
- jm = new FileJournalManager(sd);
- assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1));
+ jm = new FileJournalManager(sd, storage);
+ assertEquals(6*TXNS_PER_ROLL, jm.getNumberOfTransactions(1, true));
}
/**
@@ -133,16 +136,19 @@
new AbortSpec(5, 2));
Iterator<StorageDirectory> dirs = storage.dirIterator(NameNodeDirType.EDITS);
StorageDirectory sd = dirs.next();
- FileJournalManager jm = new FileJournalManager(sd);
- assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1));
+ FileJournalManager jm = new FileJournalManager(sd, storage);
+ assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1,
+ true));
sd = dirs.next();
- jm = new FileJournalManager(sd);
- assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1));
+ jm = new FileJournalManager(sd, storage);
+ assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1,
+ true));
sd = dirs.next();
- jm = new FileJournalManager(sd);
- assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1));
+ jm = new FileJournalManager(sd, storage);
+ assertEquals(5*TXNS_PER_ROLL + TXNS_PER_FAIL, jm.getNumberOfTransactions(1,
+ true));
}
/**
@@ -156,6 +162,25 @@
}
raf.close();
}
+
+ @Test(expected=IllegalStateException.class)
+ public void testFinalizeErrorReportedToNNStorage() throws IOException, InterruptedException {
+ File f = new File(TestEditLog.TEST_DIR + "/filejournaltestError");
+ // abort after 10th roll
+ NNStorage storage = setupEdits(Collections.<URI>singletonList(f.toURI()),
+ 10, new AbortSpec(10, 0));
+ StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next();
+
+ FileJournalManager jm = new FileJournalManager(sd, storage);
+ String sdRootPath = sd.getRoot().getAbsolutePath();
+ FileUtil.chmod(sdRootPath, "-w", true);
+ try {
+ jm.finalizeLogSegment(0, 1);
+ } finally {
+ assertTrue(storage.getRemovedStorageDirs().contains(sd));
+ FileUtil.chmod(sdRootPath, "+w", true);
+ }
+ }
/**
* Test that we can read from a stream created by FileJournalManager.
@@ -171,17 +196,17 @@
10, new AbortSpec(10, 0));
StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next();
- FileJournalManager jm = new FileJournalManager(sd);
+ FileJournalManager jm = new FileJournalManager(sd, storage);
long expectedTotalTxnCount = TXNS_PER_ROLL*10 + TXNS_PER_FAIL;
- assertEquals(expectedTotalTxnCount, jm.getNumberOfTransactions(1));
+ assertEquals(expectedTotalTxnCount, jm.getNumberOfTransactions(1, true));
long skippedTxns = (3*TXNS_PER_ROLL); // skip first 3 files
long startingTxId = skippedTxns + 1;
- long numTransactionsToLoad = jm.getNumberOfTransactions(startingTxId);
+ long numTransactionsToLoad = jm.getNumberOfTransactions(startingTxId, true);
long numLoaded = 0;
while (numLoaded < numTransactionsToLoad) {
- EditLogInputStream editIn = jm.getInputStream(startingTxId);
+ EditLogInputStream editIn = jm.getInputStream(startingTxId, true);
FSEditLogLoader.EditLogValidation val = FSEditLogLoader.validateEditLog(editIn);
long count = val.getNumTransactions();
@@ -194,20 +219,26 @@
}
/**
- * Try to make a request with a start transaction id which doesn't
- * match the start ID of some log segment.
- * This should fail as edit logs must currently be treated as indevisable
- * units.
+ * Make requests with starting transaction ids which don't match the beginning
+ * txid of some log segments.
+ *
+ * This should succeed.
*/
- @Test(expected=IOException.class)
+ @Test
public void testAskForTransactionsMidfile() throws IOException {
File f = new File(TestEditLog.TEST_DIR + "/filejournaltest2");
NNStorage storage = setupEdits(Collections.<URI>singletonList(f.toURI()),
10);
StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next();
- FileJournalManager jm = new FileJournalManager(sd);
- jm.getNumberOfTransactions(2);
+ FileJournalManager jm = new FileJournalManager(sd, storage);
+
+ // 10 rolls, so 11 rolled files, 110 txids total.
+ final int TOTAL_TXIDS = 10 * 11;
+ for (int txid = 1; txid <= TOTAL_TXIDS; txid++) {
+ assertEquals((TOTAL_TXIDS - txid) + 1, jm.getNumberOfTransactions(txid,
+ true));
+ }
}
/**
@@ -237,19 +268,20 @@
assertEquals(1, files.length);
assertTrue(files[0].delete());
- FileJournalManager jm = new FileJournalManager(sd);
- assertEquals(startGapTxId-1, jm.getNumberOfTransactions(1));
+ FileJournalManager jm = new FileJournalManager(sd, storage);
+ assertEquals(startGapTxId-1, jm.getNumberOfTransactions(1, true));
try {
- jm.getNumberOfTransactions(startGapTxId);
+ jm.getNumberOfTransactions(startGapTxId, true);
fail("Should have thrown an exception by now");
} catch (IOException ioe) {
- assertTrue(true);
+ GenericTestUtils.assertExceptionContains(
+ "Gap in transactions, max txnid is 110, 0 txns from 31", ioe);
}
// rolled 10 times so there should be 11 files.
assertEquals(11*TXNS_PER_ROLL - endGapTxId,
- jm.getNumberOfTransactions(endGapTxId+1));
+ jm.getNumberOfTransactions(endGapTxId + 1, true));
}
/**
@@ -274,9 +306,9 @@
corruptAfterStartSegment(files[0]);
- FileJournalManager jm = new FileJournalManager(sd);
+ FileJournalManager jm = new FileJournalManager(sd, storage);
assertEquals(10*TXNS_PER_ROLL+1,
- jm.getNumberOfTransactions(1));
+ jm.getNumberOfTransactions(1, true));
}
@Test
@@ -288,14 +320,15 @@
NNStorage.getInProgressEditsFileName(201),
NNStorage.getFinalizedEditsFileName(1001, 1100));
- FileJournalManager fjm = new FileJournalManager(sd);
+ // passing null for NNStorage because this unit test will not use it
+ FileJournalManager fjm = new FileJournalManager(sd, null);
assertEquals("[1,100],[101,200],[1001,1100]", getLogsAsString(fjm, 1));
assertEquals("[101,200],[1001,1100]", getLogsAsString(fjm, 101));
assertEquals("[1001,1100]", getLogsAsString(fjm, 201));
try {
assertEquals("[]", getLogsAsString(fjm, 150));
fail("Did not throw when asking for a txn in the middle of a log");
- } catch (IOException ioe) {
+ } catch (IllegalStateException ioe) {
GenericTestUtils.assertExceptionContains(
"150 which is in the middle", ioe);
}
@@ -303,6 +336,60 @@
"", getLogsAsString(fjm, 9999));
}
+ /**
+ * tests that passing an invalid dir to matchEditLogs throws IOException
+ */
+ @Test(expected = IOException.class)
+ public void testMatchEditLogInvalidDirThrowsIOException() throws IOException {
+ File badDir = new File("does not exist");
+ FileJournalManager.matchEditLogs(badDir);
+ }
+
+ /**
+ * Make sure that we start reading at the correct op when we request a stream
+ * with a txid in the middle of an edit log file.
+ */
+ @Test
+ public void testReadFromMiddleOfEditLog() throws CorruptionException,
+ IOException {
+ File f = new File(TestEditLog.TEST_DIR + "/filejournaltest2");
+ NNStorage storage = setupEdits(Collections.<URI>singletonList(f.toURI()),
+ 10);
+ StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next();
+
+ FileJournalManager jm = new FileJournalManager(sd, storage);
+
+ EditLogInputStream elis = jm.getInputStream(5, true);
+ FSEditLogOp op = elis.readOp();
+ assertEquals("read unexpected op", op.getTransactionId(), 5);
+ }
+
+ /**
+ * Make sure that in-progress streams aren't counted if we don't ask for
+ * them.
+ */
+ @Test
+ public void testExcludeInProgressStreams() throws CorruptionException,
+ IOException {
+ File f = new File(TestEditLog.TEST_DIR + "/filejournaltest2");
+
+ // Don't close the edit log once the files have been set up.
+ NNStorage storage = setupEdits(Collections.<URI>singletonList(f.toURI()),
+ 10, false);
+ StorageDirectory sd = storage.dirIterator(NameNodeDirType.EDITS).next();
+
+ FileJournalManager jm = new FileJournalManager(sd, storage);
+
+ // If we exclude the in-progress stream, we should only have 100 tx.
+ assertEquals(100, jm.getNumberOfTransactions(1, false));
+
+ EditLogInputStream elis = jm.getInputStream(90, false);
+ FSEditLogOp lastReadOp = null;
+ while ((lastReadOp = elis.readOp()) != null) {
+ assertTrue(lastReadOp.getTransactionId() <= 100);
+ }
+ }
+
private static String getLogsAsString(
FileJournalManager fjm, long firstTxId) throws IOException {
return Joiner.on(",").join(fjm.getRemoteEditLogs(firstTxId));
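The widened FileJournalManager API is the thread running through this whole file: the manager is constructed with its owning NNStorage (null is tolerated when a test never touches it), and getNumberOfTransactions/getInputStream take an explicit inProgressOk flag. A hedged sketch of both call shapes:

package org.apache.hadoop.hdfs.server.namenode;

import java.io.IOException;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;

// Hypothetical sketch; in the namenode package so the package-visible
// journal classes are reachable, as in the tests above.
class FileJournalManagerSketch {
  static long finalizedTxnCount(StorageDirectory sd, NNStorage storage)
      throws IOException {
    FileJournalManager jm = new FileJournalManager(sd, storage);
    // false: do not count any edits_inprogress_* segment.
    return jm.getNumberOfTransactions(1, false);
  }

  static FSEditLogOp firstOpAt(StorageDirectory sd, NNStorage storage,
      long txid) throws IOException {
    FileJournalManager jm = new FileJournalManager(sd, storage);
    // true: the stream may be backed by an in-progress segment, and it is
    // positioned so the first op read carries the requested txid.
    return jm.getInputStream(txid, true).readOp();
  }
}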
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestGenericJournalConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestGenericJournalConf.java
index 00fe43f..51e49a9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestGenericJournalConf.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestGenericJournalConf.java
@@ -144,13 +144,13 @@
}
@Override
- public EditLogInputStream getInputStream(long fromTxnId)
+ public EditLogInputStream getInputStream(long fromTxnId, boolean inProgressOk)
throws IOException {
return null;
}
@Override
- public long getNumberOfTransactions(long fromTxnId)
+ public long getNumberOfTransactions(long fromTxnId, boolean inProgressOk)
throws IOException {
return 0;
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionFunctional.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionFunctional.java
index aad8d7d..e7a9cc1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionFunctional.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionFunctional.java
@@ -61,6 +61,7 @@
throws IOException {
MiniDFSCluster cluster = null;
Configuration conf = new HdfsConfiguration();
+ conf.setLong(DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY, 0);
File sd0 = new File(TEST_ROOT_DIR, "nn0");
File sd1 = new File(TEST_ROOT_DIR, "nn1");
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionManager.java
index aadca5c..4c6334f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNNStorageRetentionManager.java
@@ -23,6 +23,7 @@
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.FSImageFile;
@@ -33,6 +34,7 @@
import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger;
import org.junit.Assert;
+import org.junit.Before;
import org.junit.Test;
import org.mockito.ArgumentCaptor;
import org.mockito.Mockito;
@@ -46,6 +48,17 @@
public class TestNNStorageRetentionManager {
+ Configuration conf = new Configuration();
+
+ /**
+ * For the purpose of this test, purge as many edits as we can
+ * with no extra "safety cushion"
+ */
+ @Before
+ public void setNoExtraEditRetention() {
+ conf.setLong(DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY, 0);
+ }
+
/**
* Test the "easy case" where we have more images in the
* directory than we need to keep. Should purge the
@@ -163,9 +176,27 @@
runTest(tc);
}
- private void runTest(TestCaseDescription tc) throws IOException {
- Configuration conf = new Configuration();
+ @Test
+ public void testRetainExtraLogs() throws IOException {
+ conf.setLong(DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY,
+ 50);
+ TestCaseDescription tc = new TestCaseDescription();
+ tc.addRoot("/foo1", NameNodeDirType.IMAGE);
+ tc.addRoot("/foo2", NameNodeDirType.EDITS);
+ tc.addImage("/foo1/current/" + getImageFileName(100), true);
+ tc.addImage("/foo1/current/" + getImageFileName(200), true);
+ tc.addImage("/foo1/current/" + getImageFileName(300), false);
+ tc.addImage("/foo1/current/" + getImageFileName(400), false);
+ tc.addLog("/foo2/current/" + getFinalizedEditsFileName(101, 200), true);
+ // Since we need 50 extra edits, *do* retain the 201-300 segment
+ tc.addLog("/foo2/current/" + getFinalizedEditsFileName(201, 300), false);
+ tc.addLog("/foo2/current/" + getFinalizedEditsFileName(301, 400), false);
+ tc.addLog("/foo2/current/" + getInProgressEditsFileName(401), false);
+ runTest(tc);
+ }
+
+ private void runTest(TestCaseDescription tc) throws IOException {
StoragePurger mockPurger =
Mockito.mock(NNStorageRetentionManager.StoragePurger.class);
ArgumentCaptor<FSImageFile> imagesPurgedCaptor =
@@ -261,8 +292,9 @@
for (FakeRoot root : dirRoots.values()) {
if (!root.type.isOfType(NameNodeDirType.EDITS)) continue;
+ // passing null NNStorage for unit test because it does not use it
FileJournalManager fjm = new FileJournalManager(
- root.mockStorageDir());
+ root.mockStorageDir(), null);
fjm.purger = purger;
jms.add(fjm);
}
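The retention tests now pin the new extra-edits cushion explicitly: 0 makes the purger as aggressive as possible, while a positive value keeps that many additional transactions of already-checkpointed edit history (hence the retained 201-300 segment in testRetainExtraLogs above). A hedged one-liner of the knob:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;

// Hypothetical helper; just the configuration knob used above.
class EditsRetentionSketch {
  static Configuration withExtraEditsRetained(long numTxns) {
    Configuration conf = new Configuration();
    conf.setLong(
        DFSConfigKeys.DFS_NAMENODE_NUM_EXTRA_EDITS_RETAINED_KEY, numTxns);
    return conf;
  }
}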
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java
index 559d165..49a96e9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java
@@ -50,13 +50,7 @@
assertFalse(testResourceScenario(4, 0, 3, 0, 2));
assertTrue(testResourceScenario(4, 0, 3, 0, 1));
assertFalse(testResourceScenario(4, 0, 4, 0, 1));
- try {
- testResourceScenario(1, 0, 0, 0, 2);
- fail("Should fail if there are more minimum redundant resources than " +
- "total redundant resources");
- } catch (RuntimeException rte) {
- assertTrue(rte.getMessage().startsWith("Need a minimum"));
- }
+ assertFalse(testResourceScenario(1, 0, 0, 0, 2));
}
@Test
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java
index d4fd72d..596df8d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java
@@ -24,6 +24,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
@@ -91,6 +92,9 @@
FileSystem fileSys = null;
try {
+ conf.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
+
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(NUM_DATA_NODES).build();
cluster.waitActive();
fileSys = cluster.getFileSystem();
@@ -106,7 +110,6 @@
// set small size of flush buffer
editLog.setOutputBufferCapacity(2048);
- namesystem.getDelegationTokenSecretManager().startThreads();
// Create threads and make them run transactions concurrently.
Thread threadId[] = new Thread[NUM_THREADS];
@@ -141,7 +144,7 @@
System.out.println("Verifying file: " + editFile);
FSEditLogLoader loader = new FSEditLogLoader(namesystem);
- int numEdits = loader.loadFSEdits(
+ long numEdits = loader.loadFSEdits(
new EditLogFileInputStream(editFile), 1);
assertEquals("Verification for " + editFile, expectedTransactions, numEdits);
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java
index 9233009..74c3cf8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestStartup.java
@@ -512,11 +512,10 @@
InetAddress inetAddress = InetAddress.getByAddress(b);
list.add(inetAddress.getHostName());
writeConfigFile(localFileSys, hostsFile, list);
- int numNameNodes = 1;
int numDatanodes = 1;
try {
- cluster = new MiniDFSCluster.Builder(conf).numNameNodes(numNameNodes)
+ cluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(numDatanodes).setupHostsFile(true).build();
cluster.waitActive();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestValidateConfigurationSettings.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestValidateConfigurationSettings.java
index 397ad72..53f4f96 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestValidateConfigurationSettings.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestValidateConfigurationSettings.java
@@ -19,10 +19,12 @@
import static org.junit.Assert.*;
import org.junit.Test;
+import java.io.File;
import java.io.IOException;
import junit.framework.Assert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
@@ -71,4 +73,25 @@
DFSTestUtil.formatNameNode(conf);
NameNode nameNode = new NameNode(conf); // should be OK!
}
+
+ /**
+ * HDFS-3013: NameNode format command doesn't pick up
+ * dfs.namenode.name.dir.NameServiceId configuration.
+ */
+ @Test
+ public void testGenericKeysForNameNodeFormat()
+ throws IOException {
+ Configuration conf = new HdfsConfiguration();
+ FileSystem.setDefaultUri(conf, "hdfs://localhost:8070");
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, "ns1");
+ String nameDir = System.getProperty("java.io.tmpdir") + "/test.dfs.name";
+ File dir = new File(nameDir);
+ if (dir.exists()) {
+ FileUtil.fullyDelete(dir);
+ }
+ conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY + ".ns1", nameDir);
+ DFSTestUtil.formatNameNode(conf);
+ NameNode nameNode = new NameNode(conf);
+ FileUtil.fullyDelete(dir);
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HAStressTestHarness.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HAStressTestHarness.java
new file mode 100644
index 0000000..39667ed
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HAStressTestHarness.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
+import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread;
+import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
+
+/**
+ * Utility class to start an HA cluster, and then start threads
+ * to periodically fail back and forth, accelerate block deletion
+ * processing, etc.
+ */
+public class HAStressTestHarness {
+ Configuration conf;
+ private MiniDFSCluster cluster;
+ static final int BLOCK_SIZE = 1024;
+ TestContext testCtx = new TestContext();
+
+ public HAStressTestHarness() {
+ conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
+ conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ // Increase max streams so that we re-replicate quickly.
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 1000);
+ }
+
+ /**
+ * Start and return the MiniDFSCluster.
+ */
+ public MiniDFSCluster startCluster() throws IOException {
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(3)
+ .build();
+ return cluster;
+ }
+
+ /**
+ * Return a filesystem with client-failover configured for the
+ * cluster.
+ */
+ public FileSystem getFailoverFs() throws IOException, URISyntaxException {
+ return HATestUtil.configureFailoverFs(cluster, conf);
+ }
+
+ /**
+ * Add a thread which periodically triggers deletion reports,
+ * heartbeats, and NN-side block work.
+ * @param interval millisecond period on which to run
+ */
+ public void addReplicationTriggerThread(final int interval) {
+
+ testCtx.addThread(new RepeatingTestThread(testCtx) {
+
+ @Override
+ public void doAnAction() throws Exception {
+ for (DataNode dn : cluster.getDataNodes()) {
+ DataNodeAdapter.triggerDeletionReport(dn);
+ DataNodeAdapter.triggerHeartbeat(dn);
+ }
+ for (int i = 0; i < 2; i++) {
+ NameNode nn = cluster.getNameNode(i);
+ BlockManagerTestUtil.computeAllPendingWork(
+ nn.getNamesystem().getBlockManager());
+ }
+ Thread.sleep(interval);
+ }
+ });
+ }
+
+ /**
+ * Add a thread which periodically triggers failover back and forth between
+ * the two namenodes.
+ */
+ public void addFailoverThread(final int msBetweenFailovers) {
+ testCtx.addThread(new RepeatingTestThread(testCtx) {
+
+ @Override
+ public void doAnAction() throws Exception {
+ System.err.println("==============================\n" +
+ "Failing over from 0->1\n" +
+ "==================================");
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ Thread.sleep(msBetweenFailovers);
+ System.err.println("==============================\n" +
+ "Failing over from 1->0\n" +
+ "==================================");
+
+ cluster.transitionToStandby(1);
+ cluster.transitionToActive(0);
+ Thread.sleep(msBetweenFailovers);
+ }
+ });
+ }
+
+ /**
+ * Start all of the threads which have been added.
+ */
+ public void startThreads() {
+ this.testCtx.startThreads();
+ }
+
+ /**
+ * Stop threads, propagating any exceptions that might have been thrown.
+ */
+ public void stopThreads() throws Exception {
+ this.testCtx.stop();
+ }
+
+ /**
+ * Shutdown the minicluster, as well as any of the running threads.
+ */
+ public void shutdown() throws Exception {
+ this.testCtx.stop();
+ if (cluster != null) {
+ this.cluster.shutdown();
+ cluster = null;
+ }
+ }
+}
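A minimal usage sketch for this harness, mirroring how TestDNFencingWithReplication (further below) drives it; the timing values and the placeholder client operations are illustrative assumptions:

    // Sketch only: intervals and the client workload are assumptions.
    HAStressTestHarness harness = new HAStressTestHarness();
    MiniDFSCluster cluster = harness.startCluster();
    try {
      cluster.waitActive();
      cluster.transitionToActive(0);

      FileSystem fs = harness.getFailoverFs();

      // Keep replication work and failovers happening in the background.
      harness.addReplicationTriggerThread(500);
      harness.addFailoverThread(5000);
      harness.startThreads();

      // ... run client operations against 'fs' here ...

      harness.stopThreads();
    } finally {
      harness.shutdown();
    }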
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HATestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HATestUtil.java
new file mode 100644
index 0000000..bf919ce
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/HATestUtil.java
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.List;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
+import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.test.GenericTestUtils;
+
+import com.google.common.base.Supplier;
+
+/**
+ * Static utility functions useful for testing HA.
+ */
+public abstract class HATestUtil {
+ private static Log LOG = LogFactory.getLog(HATestUtil.class);
+
+ private static final String LOGICAL_HOSTNAME = "ha-nn-uri-%d";
+
+ /**
+ * Trigger an edit log roll on the active and then wait for the standby to
+ * catch up to all the edits done by the active. This method will check
+ * repeatedly for up to NN_LAG_TIMEOUT milliseconds, and then fail by throwing
+ * {@link CouldNotCatchUpException}.
+ *
+ * @param active active NN
+ * @param standby standby NN which should catch up to active
+ * @throws IOException if an error occurs rolling the edit log
+ * @throws CouldNotCatchUpException if the standby doesn't catch up to the
+ * active in NN_LAG_TIMEOUT milliseconds
+ */
+ static void waitForStandbyToCatchUp(NameNode active,
+ NameNode standby) throws InterruptedException, IOException, CouldNotCatchUpException {
+
+ long activeTxId = active.getNamesystem().getFSImage().getEditLog()
+ .getLastWrittenTxId();
+
+ active.getRpcServer().rollEditLog();
+
+ long start = System.currentTimeMillis();
+ while (System.currentTimeMillis() - start < TestEditLogTailer.NN_LAG_TIMEOUT) {
+ long nn2HighestTxId = standby.getNamesystem().getFSImage()
+ .getLastAppliedTxId();
+ if (nn2HighestTxId >= activeTxId) {
+ return;
+ }
+ Thread.sleep(TestEditLogTailer.SLEEP_TIME);
+ }
+ throw new CouldNotCatchUpException("Standby did not catch up to txid " +
+ activeTxId + " (currently at " +
+ standby.getNamesystem().getFSImage().getLastAppliedTxId() + ")");
+ }
+
+ /**
+ * Wait for the datanodes in the cluster to process any block
+ * deletions that have already been asynchronously queued.
+ */
+ static void waitForDNDeletions(final MiniDFSCluster cluster)
+ throws TimeoutException, InterruptedException {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ for (DataNode dn : cluster.getDataNodes()) {
+ if (DataNodeAdapter.getPendingAsyncDeletions(dn) > 0) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }, 1000, 10000);
+
+ }
+
+ /**
+ * Wait for the NameNode to issue any deletions that are already
+ * pending (i.e. for the pendingDeletionBlocksCount to go to 0)
+ */
+ static void waitForNNToIssueDeletions(final NameNode nn)
+ throws Exception {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ LOG.info("Waiting for NN to issue block deletions to DNs");
+ return nn.getNamesystem().getBlockManager().getPendingDeletionBlocksCount() == 0;
+ }
+ }, 250, 10000);
+ }
+
+ public static class CouldNotCatchUpException extends IOException {
+ private static final long serialVersionUID = 1L;
+
+ public CouldNotCatchUpException(String message) {
+ super(message);
+ }
+ }
+
+ /** Gets the filesystem instance by setting the failover configurations */
+ public static FileSystem configureFailoverFs(MiniDFSCluster cluster, Configuration conf)
+ throws IOException, URISyntaxException {
+ return configureFailoverFs(cluster, conf, 0);
+ }
+
+ /**
+ * Gets the filesystem instance by setting the failover configurations
+ * @param cluster the single process DFS cluster
+ * @param conf cluster configuration
+ * @param nsIndex namespace index starting with zero
+ * @throws IOException if an error occurs creating the FileSystem
+ */
+ public static FileSystem configureFailoverFs(MiniDFSCluster cluster, Configuration conf,
+ int nsIndex) throws IOException, URISyntaxException {
+ conf = new Configuration(conf);
+ String logicalName = getLogicalHostname(cluster);
+ setFailoverConfigurations(cluster, conf, logicalName, nsIndex);
+ FileSystem fs = FileSystem.get(new URI("hdfs://" + logicalName), conf);
+ return fs;
+ }
+
+ public static void setFailoverConfigurations(MiniDFSCluster cluster,
+ Configuration conf) {
+ setFailoverConfigurations(cluster, conf, getLogicalHostname(cluster));
+ }
+
+ /** Sets the required configurations for performing failover of the default namespace. */
+ public static void setFailoverConfigurations(MiniDFSCluster cluster,
+ Configuration conf, String logicalName) {
+ setFailoverConfigurations(cluster, conf, logicalName, 0);
+ }
+
+ /** Sets the required configurations for performing failover. */
+ public static void setFailoverConfigurations(MiniDFSCluster cluster,
+ Configuration conf, String logicalName, int nsIndex) {
+ InetSocketAddress nnAddr1 = cluster.getNameNode(2 * nsIndex).getNameNodeAddress();
+ InetSocketAddress nnAddr2 = cluster.getNameNode(2 * nsIndex + 1).getNameNodeAddress();
+ String nameNodeId1 = "nn1";
+ String nameNodeId2 = "nn2";
+ String address1 = "hdfs://" + nnAddr1.getHostName() + ":" + nnAddr1.getPort();
+ String address2 = "hdfs://" + nnAddr2.getHostName() + ":" + nnAddr2.getPort();
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY,
+ logicalName, nameNodeId1), address1);
+ conf.set(DFSUtil.addKeySuffixes(DFS_NAMENODE_RPC_ADDRESS_KEY,
+ logicalName, nameNodeId2), address2);
+
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, logicalName);
+ conf.set(DFSUtil.addKeySuffixes(DFS_HA_NAMENODES_KEY_PREFIX, logicalName),
+ nameNodeId1 + "," + nameNodeId2);
+ conf.set(DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX + "." + logicalName,
+ ConfiguredFailoverProxyProvider.class.getName());
+ conf.set("fs.defaultFS", "hdfs://" + logicalName);
+ }
+
+
+ public static String getLogicalHostname(MiniDFSCluster cluster) {
+ return String.format(LOGICAL_HOSTNAME, cluster.getInstanceId());
+ }
+
+ public static URI getLogicalUri(MiniDFSCluster cluster)
+ throws URISyntaxException {
+ return new URI(HdfsConstants.HDFS_URI_SCHEME + "://" +
+ getLogicalHostname(cluster));
+ }
+
+ public static void waitForCheckpoint(MiniDFSCluster cluster, int nnIdx,
+ List<Integer> txids) throws InterruptedException {
+ long start = System.currentTimeMillis();
+ while (true) {
+ try {
+ FSImageTestUtil.assertNNHasCheckpoints(cluster, nnIdx, txids);
+ return;
+ } catch (AssertionError err) {
+ if (System.currentTimeMillis() - start > 10000) {
+ throw err;
+ } else {
+ Thread.sleep(300);
+ }
+ }
+ }
+ }
+}
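A minimal sketch (same package as these helpers) of how a test combines them: bring up a two-NN HA minicluster, obtain a failover-aware client, write through the active, and wait for the standby to apply the edits. The directory path and the explicit tailing period are illustrative assumptions drawn from the tests below.

    // Sketch under the assumptions noted above.
    Configuration conf = new Configuration();
    conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1); // tail edits quickly
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
        .nnTopology(MiniDFSNNTopology.simpleHATopology())
        .numDataNodes(0)
        .build();
    try {
      cluster.waitActive();
      cluster.transitionToActive(0);
      NameNode active = cluster.getNameNode(0);
      NameNode standby = cluster.getNameNode(1);

      // Client configured with the logical URI and failover proxy provider.
      FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
      fs.mkdirs(new Path("/example")); // path is an assumption

      // Roll the active's edit log and wait until the standby has applied it.
      HATestUtil.waitForStandbyToCatchUp(active, standby);
    } finally {
      cluster.shutdown();
    }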
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSUpgradeWithHA.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSUpgradeWithHA.java
new file mode 100644
index 0000000..ccc46a2
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDFSUpgradeWithHA.java
@@ -0,0 +1,107 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.common.Storage;
+import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Tests for upgrading with HA enabled.
+ */
+public class TestDFSUpgradeWithHA {
+
+ private static final Log LOG = LogFactory.getLog(TestDFSUpgradeWithHA.class);
+
+ /**
+ * Make sure that an HA NN refuses to start if given an upgrade-related
+ * startup option.
+ */
+ @Test
+ public void testStartingWithUpgradeOptionsFails() throws IOException {
+ for (StartupOption startOpt : Lists.newArrayList(new StartupOption[] {
+ StartupOption.UPGRADE, StartupOption.FINALIZE,
+ StartupOption.ROLLBACK })) {
+ MiniDFSCluster cluster = null;
+ try {
+ cluster = new MiniDFSCluster.Builder(new Configuration())
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .startupOption(startOpt)
+ .numDataNodes(0)
+ .build();
+ fail("Should not have been able to start an HA NN in upgrade mode");
+ } catch (IllegalArgumentException iae) {
+ GenericTestUtils.assertExceptionContains(
+ "Cannot perform DFS upgrade with HA enabled.", iae);
+ LOG.info("Got expected exception", iae);
+ } finally {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+ }
+ }
+
+ /**
+ * Make sure that an HA NN won't start if a previous upgrade was in progress.
+ */
+ @Test
+ public void testStartingWithUpgradeInProgressFails() throws Exception {
+ MiniDFSCluster cluster = null;
+ try {
+ cluster = new MiniDFSCluster.Builder(new Configuration())
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+
+ // Simulate an upgrade having started.
+ for (int i = 0; i < 2; i++) {
+ for (URI uri : cluster.getNameDirs(i)) {
+ File prevTmp = new File(new File(uri), Storage.STORAGE_TMP_PREVIOUS);
+ LOG.info("creating previous tmp dir: " + prevTmp);
+ assertTrue(prevTmp.mkdirs());
+ }
+ }
+
+ cluster.restartNameNodes();
+ fail("Should not have been able to start an HA NN with an in-progress upgrade");
+ } catch (IOException ioe) {
+ GenericTestUtils.assertExceptionContains(
+ "Cannot start an HA namenode with name dirs that need recovery.",
+ ioe);
+ LOG.info("Got expected exception", ioe);
+ } finally {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java
new file mode 100644
index 0000000..ea769c0
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencing.java
@@ -0,0 +1,605 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.AppendTestUtil;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
+import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
+import org.apache.hadoop.hdfs.server.namenode.FSInodeInfo;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
+import org.apache.log4j.Level;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.mockito.Mockito;
+import org.mockito.invocation.InvocationOnMock;
+
+import com.google.common.base.Supplier;
+import com.google.common.collect.Lists;
+
+
+public class TestDNFencing {
+
+ protected static final Log LOG = LogFactory.getLog(
+ TestDNFencing.class);
+ private static final String TEST_FILE_DATA = "hello highly available world";
+ private static final String TEST_FILE = "/testStandbyIsHot";
+ private static final Path TEST_FILE_PATH = new Path(TEST_FILE);
+ private static final int SMALL_BLOCK = 1024;
+
+ private Configuration conf;
+ private MiniDFSCluster cluster;
+ private NameNode nn1, nn2;
+ private FileSystem fs;
+
+ static {
+ ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
+ }
+
+ @Before
+ public void setupCluster() throws Exception {
+ conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, SMALL_BLOCK);
+ // Bump up replication interval so that we only run replication
+ // checks explicitly.
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 600);
+ // Increase max streams so that we re-replicate quickly.
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 1000);
+ // See RandomDeleterPolicy javadoc.
+ conf.setClass("dfs.block.replicator.classname", RandomDeleterPolicy.class,
+ BlockPlacementPolicy.class);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(3)
+ .build();
+ nn1 = cluster.getNameNode(0);
+ nn2 = cluster.getNameNode(1);
+
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ // Trigger block reports so that the first NN trusts all
+ // of the DNs, and will issue deletions
+ cluster.triggerBlockReports();
+ fs = HATestUtil.configureFailoverFs(cluster, conf);
+ }
+
+ @After
+ public void shutdownCluster() throws Exception {
+ if (cluster != null) {
+ banner("Shutting down cluster. NN1 metadata:");
+ doMetasave(nn1);
+ banner("Shutting down cluster. NN2 metadata:");
+ doMetasave(nn2);
+ cluster.shutdown();
+ }
+ }
+
+
+ @Test
+ public void testDnFencing() throws Exception {
+ // Create a file with replication level 3.
+ DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)3, 1L);
+ ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, TEST_FILE_PATH);
+
+ // Drop its replication count to 1, so it becomes over-replicated.
+ // Then compute the invalidation of the extra blocks and trigger
+ // heartbeats so the invalidations are flushed to the DNs.
+ nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn1.getNamesystem().getBlockManager());
+ cluster.triggerHeartbeats();
+
+ // Transition nn2 to active even though nn1 still thinks it's active.
+ banner("Failing over to NN2 but letting NN1 continue to think it's active");
+ NameNodeAdapter.abortEditLogs(nn1);
+ NameNodeAdapter.enterSafeMode(nn1, false);
+ cluster.transitionToActive(1);
+
+ // Check that the standby picked up the replication change.
+ assertEquals(1,
+ nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());
+
+ // Dump some info for debugging purposes.
+ banner("NN2 Metadata immediately after failover");
+ doMetasave(nn2);
+
+ // Even though NN2 considers the blocks over-replicated, it should
+ // postpone the block invalidation because the DNs are still "stale".
+ assertEquals(30, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
+
+ banner("Triggering heartbeats and block reports so that fencing is completed");
+ cluster.triggerHeartbeats();
+ cluster.triggerBlockReports();
+
+ banner("Metadata after nodes have all block-reported");
+ doMetasave(nn2);
+
+ // The blocks should no longer be postponed.
+ assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
+
+ // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn2.getNamesystem().getBlockManager());
+ cluster.triggerHeartbeats();
+ HATestUtil.waitForDNDeletions(cluster);
+ cluster.triggerDeletionReports();
+ assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
+ assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());
+
+ banner("Making sure the file is still readable");
+ FileSystem fs2 = cluster.getFileSystem(1);
+ DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
+
+ banner("Waiting for the actual block files to get deleted from DNs.");
+ waitForTrueReplication(cluster, block, 1);
+ }
+
+ /**
+ * Test case which restarts the standby node in such a way that,
+ * when it exits safemode, it will want to invalidate a bunch
+ * of over-replicated block replicas. Ensures that if we fail over
+ * at this point, no data is lost.
+ */
+ @Test
+ public void testNNClearsCommandsOnFailoverAfterStartup()
+ throws Exception {
+ // Make lots of blocks to increase chances of triggering a bug.
+ DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)3, 1L);
+
+ banner("Shutting down NN2");
+ cluster.shutdownNameNode(1);
+
+ banner("Setting replication to 1, rolling edit log.");
+ nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
+ nn1.getRpcServer().rollEditLog();
+
+ // Start NN2 again. When it starts up, it will see all of the
+ // blocks as over-replicated, since it has the metadata for
+ // replication=1, but the DNs haven't yet processed the deletions.
+ banner("Starting NN2 again.");
+ cluster.restartNameNode(1);
+ nn2 = cluster.getNameNode(1);
+
+ banner("triggering BRs");
+ cluster.triggerBlockReports();
+
+ // We expect that both NN1 and NN2 will have some number of
+ // deletions queued up for the DNs.
+ banner("computing invalidation on nn1");
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn1.getNamesystem().getBlockManager());
+
+ banner("computing invalidation on nn2");
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn2.getNamesystem().getBlockManager());
+
+ // Dump some info for debugging purposes.
+ banner("Metadata immediately before failover");
+ doMetasave(nn2);
+
+
+ // Transition nn2 to active even though nn1 still thinks it's active
+ banner("Failing over to NN2 but letting NN1 continue to think it's active");
+ NameNodeAdapter.abortEditLogs(nn1);
+ NameNodeAdapter.enterSafeMode(nn1, false);
+
+ cluster.transitionToActive(1);
+
+ // Check that the standby picked up the replication change.
+ assertEquals(1,
+ nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());
+
+ // Dump some info for debugging purposes.
+ banner("Metadata immediately after failover");
+ doMetasave(nn2);
+
+ banner("Triggering heartbeats and block reports so that fencing is completed");
+ cluster.triggerHeartbeats();
+ cluster.triggerBlockReports();
+
+ banner("Metadata after nodes have all block-reported");
+ doMetasave(nn2);
+
+ // The block should no longer be postponed.
+ assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
+
+ // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn2.getNamesystem().getBlockManager());
+
+ HATestUtil.waitForNNToIssueDeletions(nn2);
+ cluster.triggerHeartbeats();
+ HATestUtil.waitForDNDeletions(cluster);
+ cluster.triggerDeletionReports();
+ assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
+ assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());
+
+ banner("Making sure the file is still readable");
+ FileSystem fs2 = cluster.getFileSystem(1);
+ DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
+ }
+
+ /**
+ * Test case that reduces replication of a file with a lot of blocks
+ * and then fails over right after those blocks enter the DN invalidation
+ * queues on the active. Ensures that fencing is correct and no replicas
+ * are lost.
+ */
+ @Test
+ public void testNNClearsCommandsOnFailoverWithReplChanges()
+ throws Exception {
+ // Make lots of blocks to increase chances of triggering a bug.
+ DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)1, 1L);
+
+ banner("rolling NN1's edit log, forcing catch-up");
+ HATestUtil.waitForStandbyToCatchUp(nn1, nn2);
+
+ // Get some new replicas reported so that NN2 now considers
+ // them over-replicated and schedules some more deletions
+ nn1.getRpcServer().setReplication(TEST_FILE, (short) 2);
+ while (BlockManagerTestUtil.getComputedDatanodeWork(
+ nn1.getNamesystem().getBlockManager()) > 0) {
+ LOG.info("Getting more replication work computed");
+ }
+ BlockManager bm1 = nn1.getNamesystem().getBlockManager();
+ while (bm1.getPendingReplicationBlocksCount() > 0) {
+ BlockManagerTestUtil.updateState(bm1);
+ cluster.triggerHeartbeats();
+ Thread.sleep(1000);
+ }
+
+ banner("triggering BRs");
+ cluster.triggerBlockReports();
+
+ nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
+
+
+ banner("computing invalidation on nn1");
+
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn1.getNamesystem().getBlockManager());
+ doMetasave(nn1);
+
+ banner("computing invalidation on nn2");
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn2.getNamesystem().getBlockManager());
+ doMetasave(nn2);
+
+ // Dump some info for debugging purposes.
+ banner("Metadata immediately before failover");
+ doMetasave(nn2);
+
+
+ // Transition nn2 to active even though nn1 still thinks it's active
+ banner("Failing over to NN2 but letting NN1 continue to think it's active");
+ NameNodeAdapter.abortEditLogs(nn1);
+ NameNodeAdapter.enterSafeMode(nn1, false);
+
+
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn2.getNamesystem().getBlockManager());
+ cluster.transitionToActive(1);
+
+ // Check that the standby picked up the replication change.
+ assertEquals(1,
+ nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());
+
+ // Dump some info for debugging purposes.
+ banner("Metadata immediately after failover");
+ doMetasave(nn2);
+
+ banner("Triggering heartbeats and block reports so that fencing is completed");
+ cluster.triggerHeartbeats();
+ cluster.triggerBlockReports();
+
+ banner("Metadata after nodes have all block-reported");
+ doMetasave(nn2);
+
+ // The block should no longer be postponed.
+ assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
+
+ // Wait for NN2 to enact its deletions (replication monitor has to run, etc)
+ BlockManagerTestUtil.computeInvalidationWork(
+ nn2.getNamesystem().getBlockManager());
+
+ HATestUtil.waitForNNToIssueDeletions(nn2);
+ cluster.triggerHeartbeats();
+ HATestUtil.waitForDNDeletions(cluster);
+ cluster.triggerDeletionReports();
+ assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
+ assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());
+
+ banner("Making sure the file is still readable");
+ FileSystem fs2 = cluster.getFileSystem(1);
+ DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
+ }
+
+ /**
+ * Regression test for HDFS-2742. The issue in this bug was:
+ * - DN does a block report while file is open. This BR contains
+ * the block in RBW state.
+ * - Standby queues the RBW state in PendingDatanodeMessages
+ * - Standby processes edit logs during failover. Before fixing
+ * this bug, it was mistakenly applying the RBW reported state
+ * after the block had been completed, causing the block to get
+ * marked corrupt. Instead, we should now be applying the RBW
+ * message on OP_ADD, and then the FINALIZED message on OP_CLOSE.
+ */
+ @Test
+ public void testBlockReportsWhileFileBeingWritten() throws Exception {
+ FSDataOutputStream out = fs.create(TEST_FILE_PATH);
+ try {
+ AppendTestUtil.write(out, 0, 10);
+ out.hflush();
+
+ // Block report will include the RBW replica, but will be
+ // queued on the StandbyNode.
+ cluster.triggerBlockReports();
+
+ } finally {
+ IOUtils.closeStream(out);
+ }
+
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ // Verify that no replicas are marked corrupt, and that the
+ // file is readable from the failed-over standby.
+ BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
+ BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager());
+ assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks());
+ assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks());
+
+ DFSTestUtil.readFile(fs, TEST_FILE_PATH);
+ }
+
+ /**
+ * Test that, when a block is re-opened for append, the related
+ * datanode messages are correctly queued by the SBN because
+ * they have future states and genstamps.
+ */
+ @Test
+ public void testQueueingWithAppend() throws Exception {
+ int numQueued = 0;
+ int numDN = cluster.getDataNodes().size();
+
+ FSDataOutputStream out = fs.create(TEST_FILE_PATH);
+ try {
+ AppendTestUtil.write(out, 0, 10);
+ out.hflush();
+
+ // Opening the file will cause the DNs to report RBW replicas; these
+ // reports will be queued on the StandbyNode.
+ numQueued += numDN; // RBW messages
+ } finally {
+ IOUtils.closeStream(out);
+ numQueued += numDN; // blockReceived messages
+ }
+
+ cluster.triggerBlockReports();
+ numQueued += numDN;
+
+ try {
+ out = fs.append(TEST_FILE_PATH);
+ AppendTestUtil.write(out, 10, 10);
+ // The DNs report RBW replicas again once the file is re-opened for append
+ numQueued += numDN;
+
+ } finally {
+ IOUtils.closeStream(out);
+ numQueued += numDN; // blockReceived
+ }
+
+ cluster.triggerBlockReports();
+ numQueued += numDN;
+
+ assertEquals(numQueued, cluster.getNameNode(1).getNamesystem().
+ getPendingDataNodeMessageCount());
+
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ // Verify that no replicas are marked corrupt, and that the
+ // file is readable from the failed-over standby.
+ BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
+ BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager());
+ assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks());
+ assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks());
+
+ AppendTestUtil.check(fs, TEST_FILE_PATH, 20);
+ }
+
+ /**
+ * Another regression test for HDFS-2742. This tests the following sequence:
+ * - DN does a block report while file is open. This BR contains
+ * the block in RBW state.
+ * - The block report is delayed in reaching the standby.
+ * - The file is closed.
+ * - The standby processes the OP_ADD and OP_CLOSE operations before
+ * the RBW block report arrives.
+ * - The standby should not mark the block as corrupt.
+ */
+ @Test
+ public void testRBWReportArrivesAfterEdits() throws Exception {
+ final CountDownLatch brFinished = new CountDownLatch(1);
+ DelayAnswer delayer = new GenericTestUtils.DelayAnswer(LOG) {
+ @Override
+ protected Object passThrough(InvocationOnMock invocation)
+ throws Throwable {
+ try {
+ return super.passThrough(invocation);
+ } finally {
+ // inform the test that our block report went through.
+ brFinished.countDown();
+ }
+ }
+ };
+
+ FSDataOutputStream out = fs.create(TEST_FILE_PATH);
+ try {
+ AppendTestUtil.write(out, 0, 10);
+ out.hflush();
+
+ DataNode dn = cluster.getDataNodes().get(0);
+ DatanodeProtocolClientSideTranslatorPB spy =
+ DataNodeAdapter.spyOnBposToNN(dn, nn2);
+
+ Mockito.doAnswer(delayer)
+ .when(spy).blockReport(
+ Mockito.<DatanodeRegistration>anyObject(),
+ Mockito.anyString(),
+ Mockito.<StorageBlockReport[]>anyObject());
+ dn.scheduleAllBlockReport(0);
+ delayer.waitForCall();
+
+ } finally {
+ IOUtils.closeStream(out);
+ }
+
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ delayer.proceed();
+ brFinished.await();
+
+ // Verify that no replicas are marked corrupt, and that the
+ // file is readable from the failed-over standby.
+ BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
+ BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager());
+ assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks());
+ assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks());
+
+ DFSTestUtil.readFile(fs, TEST_FILE_PATH);
+ }
+
+ /**
+ * Print a big banner in the test log to make debugging easier.
+ */
+ private void banner(String string) {
+ LOG.info("\n\n\n\n================================================\n" +
+ string + "\n" +
+ "==================================================\n\n");
+ }
+
+ private void doMetasave(NameNode nn) {
+ nn.getNamesystem().writeLock();
+ try {
+ PrintWriter pw = new PrintWriter(System.err);
+ nn.getNamesystem().getBlockManager().metaSave(pw);
+ pw.flush();
+ } finally {
+ nn.getNamesystem().writeUnlock();
+ }
+ }
+
+ private void waitForTrueReplication(final MiniDFSCluster cluster,
+ final ExtendedBlock block, final int waitFor) throws Exception {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ try {
+ return getTrueReplication(cluster, block) == waitFor;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }, 500, 10000);
+ }
+
+ private int getTrueReplication(MiniDFSCluster cluster, ExtendedBlock block)
+ throws IOException {
+ int count = 0;
+ for (DataNode dn : cluster.getDataNodes()) {
+ if (DataNodeTestUtils.getFSDataset(dn).getStoredBlock(
+ block.getBlockPoolId(), block.getBlockId()) != null) {
+ count++;
+ }
+ }
+ return count;
+ }
+
+ /**
+ * A BlockPlacementPolicy which, rather than using space available, makes
+ * random decisions about which excess replica to delete. This is because,
+ * in the test cases, the two NNs will usually (but not quite always)
+ * make the same decision about which replica to delete. The fencing issues
+ * are exacerbated when the two NNs make different decisions, which can
+ * happen in "real life" when they have slightly out-of-sync heartbeat
+ * information regarding disk usage.
+ */
+ public static class RandomDeleterPolicy extends BlockPlacementPolicyDefault {
+
+ public RandomDeleterPolicy() {
+ super();
+ }
+
+ @Override
+ public DatanodeDescriptor chooseReplicaToDelete(FSInodeInfo inode,
+ Block block, short replicationFactor,
+ Collection<DatanodeDescriptor> first,
+ Collection<DatanodeDescriptor> second) {
+
+ Collection<DatanodeDescriptor> chooseFrom =
+ !first.isEmpty() ? first : second;
+
+ List<DatanodeDescriptor> l = Lists.newArrayList(chooseFrom);
+ return l.get(DFSUtil.getRandom().nextInt(l.size()));
+ }
+ }
+
+}
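The fencing step that the tests above repeat can be condensed to the following sketch; nn1, nn2 and cluster refer to the test fixture above, and the flow is taken directly from testDnFencing():

    // Force a failover while the old active still believes it is active.
    NameNodeAdapter.abortEditLogs(nn1);      // old active can no longer write edits
    NameNodeAdapter.enterSafeMode(nn1, false);
    cluster.transitionToActive(1);           // nn2 takes over

    // Invalidations stay postponed until the DNs have heartbeated and
    // block-reported to the new active, completing the fencing.
    cluster.triggerHeartbeats();
    cluster.triggerBlockReports();
    assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());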
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java
new file mode 100644
index 0000000..95d5eb9
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDNFencingWithReplication.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.IOException;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.ipc.Server;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread;
+import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
+import org.apache.log4j.Level;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.base.Supplier;
+
+
+/**
+ * Stress-test for potential bugs when replication is changing
+ * on blocks during a failover.
+ */
+public class TestDNFencingWithReplication {
+ static {
+ ((Log4JLogger)FSNamesystem.auditLog).getLogger().setLevel(Level.WARN);
+ ((Log4JLogger)Server.LOG).getLogger().setLevel(Level.FATAL);
+ ((Log4JLogger)LogFactory.getLog(
+ "org.apache.hadoop.io.retry.RetryInvocationHandler"))
+ .getLogger().setLevel(Level.FATAL);
+ }
+
+ private static final int NUM_THREADS = 20;
+ // How long the test should try to run. In practice
+ // it runs for ~20-30s longer than this constant due to startup/
+ // shutdown time.
+ private static final long RUNTIME = 35000;
+ private static final int BLOCK_SIZE = 1024;
+
+ private static class ReplicationToggler extends RepeatingTestThread {
+ private final FileSystem fs;
+ private final Path path;
+
+ public ReplicationToggler(TestContext ctx, FileSystem fs, Path p) {
+ super(ctx);
+ this.fs = fs;
+ this.path = p;
+ }
+
+ @Override
+ public void doAnAction() throws Exception {
+ fs.setReplication(path, (short)1);
+ waitForReplicas(1);
+ fs.setReplication(path, (short)2);
+ waitForReplicas(2);
+ }
+
+ private void waitForReplicas(final int replicas) throws Exception {
+ try {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ try {
+ BlockLocation[] blocks = fs.getFileBlockLocations(path, 0, 10);
+ Assert.assertEquals(1, blocks.length);
+ return blocks[0].getHosts().length == replicas;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }, 100, 60000);
+ } catch (TimeoutException te) {
+ throw new IOException("Timed out waiting for " + replicas + " replicas " +
+ "on path " + path);
+ }
+ }
+
+ public String toString() {
+ return "Toggler for " + path;
+ }
+ }
+
+ @Test
+ public void testFencingStress() throws Exception {
+ HAStressTestHarness harness = new HAStressTestHarness();
+ harness.conf.setInt(
+ DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000);
+
+ final MiniDFSCluster cluster = harness.startCluster();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+
+ FileSystem fs = harness.getFailoverFs();
+ TestContext togglers = new TestContext();
+ for (int i = 0; i < NUM_THREADS; i++) {
+ Path p = new Path("/test-" + i);
+ DFSTestUtil.createFile(fs, p, BLOCK_SIZE*10, (short)3, (long)i);
+ togglers.addThread(new ReplicationToggler(togglers, fs, p));
+ }
+
+ // Start a separate thread which will make sure that replication
+ // happens quickly by triggering deletion reports and replication
+ // work calculation frequently.
+ harness.addReplicationTriggerThread(500);
+ harness.addFailoverThread(5000);
+ harness.startThreads();
+ togglers.startThreads();
+
+ togglers.waitFor(RUNTIME);
+ togglers.stop();
+ harness.stopThreads();
+
+ // Check that the files can be read without throwing
+ for (int i = 0; i < NUM_THREADS; i++) {
+ Path p = new Path("/test-" + i);
+ DFSTestUtil.readFile(fs, p);
+ }
+ } finally {
+ System.err.println("===========================\n\n\n\n");
+ harness.shutdown();
+ }
+
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDelegationTokensWithHA.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDelegationTokensWithHA.java
new file mode 100644
index 0000000..561e4d6
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestDelegationTokensWithHA.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.URI;
+import java.security.PrivilegedExceptionAction;
+import java.util.Collection;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSelector;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.token.Token;
+import org.apache.hadoop.security.token.TokenIdentifier;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.google.common.base.Joiner;
+
+/**
+ * Test case for client support of delegation tokens in an HA cluster.
+ * See HDFS-2904 for more info.
+ **/
+public class TestDelegationTokensWithHA {
+ private static Configuration conf = new Configuration();
+ private static final Log LOG =
+ LogFactory.getLog(TestDelegationTokensWithHA.class);
+ private static MiniDFSCluster cluster;
+ private static NameNode nn0;
+ private static NameNode nn1;
+ private static FileSystem fs;
+ private static DelegationTokenSecretManager dtSecretManager;
+ private static DistributedFileSystem dfs;
+
+ @BeforeClass
+ public static void setupCluster() throws Exception {
+ conf.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
+ conf.set("hadoop.security.auth_to_local",
+ "RULE:[2:$1@$0](JobTracker@.*FOO.COM)s/@.*//" + "DEFAULT");
+
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+ cluster.waitActive();
+
+ nn0 = cluster.getNameNode(0);
+ nn1 = cluster.getNameNode(1);
+ fs = HATestUtil.configureFailoverFs(cluster, conf);
+ dfs = (DistributedFileSystem)fs;
+
+ cluster.transitionToActive(0);
+ dtSecretManager = NameNodeAdapter.getDtSecretManager(
+ nn0.getNamesystem());
+ }
+
+ @AfterClass
+ public static void shutdownCluster() throws IOException {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+
+
+ @Test
+ public void testDelegationTokenDFSApi() throws Exception {
+ Token<DelegationTokenIdentifier> token = dfs.getDelegationToken("JobTracker");
+ DelegationTokenIdentifier identifier = new DelegationTokenIdentifier();
+ byte[] tokenId = token.getIdentifier();
+ identifier.readFields(new DataInputStream(
+ new ByteArrayInputStream(tokenId)));
+
+ // Ensure that it's present in the NN's secret manager and can
+ // be renewed directly from there.
+ LOG.info("A valid token should have non-null password, " +
+ "and should be renewed successfully");
+ assertTrue(null != dtSecretManager.retrievePassword(identifier));
+ dtSecretManager.renewToken(token, "JobTracker");
+
+ // Use the client conf with the failover info present to check
+ // renewal.
+ Configuration clientConf = dfs.getConf();
+ doRenewOrCancel(token, clientConf, TokenTestAction.RENEW);
+
+ // Using a configuration that doesn't have the logical nameservice
+ // configured should result in a reasonable error message.
+ Configuration emptyConf = new Configuration();
+ try {
+ doRenewOrCancel(token, emptyConf, TokenTestAction.RENEW);
+ fail("Did not throw trying to renew with an empty conf!");
+ } catch (IOException ioe) {
+ GenericTestUtils.assertExceptionContains(
+ "Unable to map logical nameservice URI", ioe);
+ }
+
+
+ // Ensure that the token can be renewed again after a failover.
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+ doRenewOrCancel(token, clientConf, TokenTestAction.RENEW);
+
+ doRenewOrCancel(token, clientConf, TokenTestAction.CANCEL);
+ }
+
+ @SuppressWarnings("deprecation")
+ @Test
+ public void testDelegationTokenWithDoAs() throws Exception {
+ final Token<DelegationTokenIdentifier> token =
+ dfs.getDelegationToken("JobTracker");
+ final UserGroupInformation longUgi = UserGroupInformation
+ .createRemoteUser("JobTracker/foo.com@FOO.COM");
+ final UserGroupInformation shortUgi = UserGroupInformation
+ .createRemoteUser("JobTracker");
+ longUgi.doAs(new PrivilegedExceptionAction<Void>() {
+ public Void run() throws Exception {
+ DistributedFileSystem dfs = (DistributedFileSystem)
+ HATestUtil.configureFailoverFs(cluster, conf);
+ // try renew with long name
+ dfs.renewDelegationToken(token);
+ return null;
+ }
+ });
+ shortUgi.doAs(new PrivilegedExceptionAction<Void>() {
+ public Void run() throws Exception {
+ DistributedFileSystem dfs = (DistributedFileSystem)
+ HATestUtil.configureFailoverFs(cluster, conf);
+ dfs.renewDelegationToken(token);
+ return null;
+ }
+ });
+ longUgi.doAs(new PrivilegedExceptionAction<Void>() {
+ public Void run() throws Exception {
+ DistributedFileSystem dfs = (DistributedFileSystem)
+ HATestUtil.configureFailoverFs(cluster, conf);
+ // try cancel with long name
+ dfs.cancelDelegationToken(token);
+ return null;
+ }
+ });
+ }
+
+ @Test
+ public void testHAUtilClonesDelegationTokens() throws Exception {
+ final Token<DelegationTokenIdentifier> token =
+ dfs.getDelegationToken("test");
+
+ UserGroupInformation ugi = UserGroupInformation.createRemoteUser("test");
+
+ URI haUri = new URI("hdfs://my-ha-uri/");
+ token.setService(HAUtil.buildTokenServiceForLogicalUri(haUri));
+ ugi.addToken(token);
+ HAUtil.cloneDelegationTokenForLogicalUri(ugi, haUri, nn0.getNameNodeAddress());
+ HAUtil.cloneDelegationTokenForLogicalUri(ugi, haUri, nn1.getNameNodeAddress());
+
+ Collection<Token<? extends TokenIdentifier>> tokens = ugi.getTokens();
+ assertEquals(3, tokens.size());
+
+ LOG.info("Tokens:\n" + Joiner.on("\n").join(tokens));
+
+ // check that the token selected for one of the physical IPC addresses
+ // matches the one we received
+ InetSocketAddress addr = nn0.getNameNodeAddress();
+ Text ipcDtService = new Text(
+ addr.getAddress().getHostAddress() + ":" + addr.getPort());
+ Token<DelegationTokenIdentifier> token2 =
+ DelegationTokenSelector.selectHdfsDelegationToken(ipcDtService, ugi);
+ assertNotNull(token2);
+ assertArrayEquals(token.getIdentifier(), token2.getIdentifier());
+ assertArrayEquals(token.getPassword(), token2.getPassword());
+ }
+
+ enum TokenTestAction {
+ RENEW, CANCEL;
+ }
+
+ private static void doRenewOrCancel(
+ final Token<DelegationTokenIdentifier> token, final Configuration conf,
+ final TokenTestAction action)
+ throws IOException, InterruptedException {
+ UserGroupInformation.createRemoteUser("JobTracker").doAs(
+ new PrivilegedExceptionAction<Void>() {
+ @Override
+ public Void run() throws Exception {
+ switch (action) {
+ case RENEW:
+ token.renew(conf);
+ break;
+ case CANCEL:
+ token.cancel(conf);
+ break;
+ default:
+ fail("bad action:" + action);
+ }
+ return null;
+ }
+ });
+ }
+}
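Condensed from testDelegationTokenDFSApi() above: a token obtained through the failover-aware client can be renewed and cancelled with that same client configuration both before and after a failover; dfs and cluster refer to the fixture above, and the renewer name follows the tests.

    Token<DelegationTokenIdentifier> token = dfs.getDelegationToken("JobTracker");
    Configuration clientConf = dfs.getConf(); // carries the failover settings

    token.renew(clientConf);                  // renew against the current active

    cluster.transitionToStandby(0);
    cluster.transitionToActive(1);

    token.renew(clientConf);                  // still valid after failover
    token.cancel(clientConf);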
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogTailer.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogTailer.java
new file mode 100644
index 0000000..bc5c487
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogTailer.java
@@ -0,0 +1,162 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.util.List;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.FSImage;
+import org.apache.hadoop.hdfs.server.namenode.NNStorage;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.log4j.Level;
+import org.junit.Test;
+
+import com.google.common.base.Supplier;
+
+public class TestEditLogTailer {
+
+ private static final String DIR_PREFIX = "/dir";
+ private static final int DIRS_TO_MAKE = 20;
+ static final long SLEEP_TIME = 1000;
+ static final long NN_LAG_TIMEOUT = 10 * 1000;
+
+ static {
+ ((Log4JLogger)FSImage.LOG).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)EditLogTailer.LOG).getLogger().setLevel(Level.ALL);
+ }
+
+ @Test
+ public void testTailer() throws IOException, InterruptedException,
+ ServiceFailedException {
+ Configuration conf = new HdfsConfiguration();
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+
+ HAUtil.setAllowStandbyReads(conf, true);
+
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+ cluster.waitActive();
+
+ cluster.transitionToActive(0);
+
+ NameNode nn1 = cluster.getNameNode(0);
+ NameNode nn2 = cluster.getNameNode(1);
+ try {
+ for (int i = 0; i < DIRS_TO_MAKE / 2; i++) {
+ NameNodeAdapter.mkdirs(nn1, getDirPath(i),
+ new PermissionStatus("test","test", new FsPermission((short)00755)),
+ true);
+ }
+
+ HATestUtil.waitForStandbyToCatchUp(nn1, nn2);
+
+ for (int i = 0; i < DIRS_TO_MAKE / 2; i++) {
+ assertTrue(NameNodeAdapter.getFileInfo(nn2,
+ getDirPath(i), false).isDir());
+ }
+
+ for (int i = DIRS_TO_MAKE / 2; i < DIRS_TO_MAKE; i++) {
+ NameNodeAdapter.mkdirs(nn1, getDirPath(i),
+ new PermissionStatus("test","test", new FsPermission((short)00755)),
+ true);
+ }
+
+ HATestUtil.waitForStandbyToCatchUp(nn1, nn2);
+
+ for (int i = DIRS_TO_MAKE / 2; i < DIRS_TO_MAKE; i++) {
+ assertTrue(NameNodeAdapter.getFileInfo(nn2,
+ getDirPath(i), false).isDir());
+ }
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ @Test
+ public void testNN0TriggersLogRolls() throws Exception {
+ testStandbyTriggersLogRolls(0);
+ }
+
+ @Test
+ public void testNN1TriggersLogRolls() throws Exception {
+ testStandbyTriggersLogRolls(1);
+ }
+
+ private static void testStandbyTriggersLogRolls(int activeIndex)
+ throws Exception {
+ Configuration conf = new Configuration();
+ // Roll every 1s
+ conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+
+ // Have to specify IPC ports so the NNs can talk to each other.
+ MiniDFSNNTopology topology = new MiniDFSNNTopology()
+ .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
+ .addNN(new MiniDFSNNTopology.NNConf("nn1").setIpcPort(10001))
+ .addNN(new MiniDFSNNTopology.NNConf("nn2").setIpcPort(10002)));
+
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(topology)
+ .numDataNodes(0)
+ .build();
+ try {
+ cluster.transitionToActive(activeIndex);
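+ // Assuming the initial segment holds txids 1-2 (the start/end segment
+ // ops), the first roll should open an in-progress segment starting at
+ // txid 3 in the shared dir.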
+ waitForLogRollInSharedDir(cluster, 3);
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ private static String getDirPath(int suffix) {
+ return DIR_PREFIX + suffix;
+ }
+
+ private static void waitForLogRollInSharedDir(MiniDFSCluster cluster,
+ long startTxId) throws Exception {
+ URI sharedUri = cluster.getSharedEditsDir(0, 1);
+ File sharedDir = new File(sharedUri.getPath(), "current");
+ final File expectedLog = new File(sharedDir,
+ NNStorage.getInProgressEditsFileName(startTxId));
+
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ return expectedLog.exists();
+ }
+ }, 100, 10000);
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogsDuringFailover.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogsDuringFailover.java
new file mode 100644
index 0000000..a245301
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestEditLogsDuringFailover.java
@@ -0,0 +1,182 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil;
+import org.apache.hadoop.hdfs.server.namenode.NNStorage;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.Test;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+
+/**
+ * Test cases for the handling of edit logs during failover
+ * and startup of the standby node.
+ */
+public class TestEditLogsDuringFailover {
+ private static final Log LOG =
+ LogFactory.getLog(TestEditLogsDuringFailover.class);
+ private static final int NUM_DIRS_IN_LOG = 5;
+
+ @Test
+ public void testStartup() throws Exception {
+ Configuration conf = new Configuration();
+ HAUtil.setAllowStandbyReads(conf, true);
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+ try {
+ // During HA startup, both nodes should be in
+ // standby and we shouldn't have any edits files
+ // in any edits directory!
+ List<URI> allDirs = Lists.newArrayList();
+ allDirs.addAll(cluster.getNameDirs(0));
+ allDirs.addAll(cluster.getNameDirs(1));
+ allDirs.add(cluster.getSharedEditsDir(0, 1));
+ assertNoEditFiles(allDirs);
+
+ // Set the first NN to active, make sure it creates edits
+ // in its own dirs and the shared dir. The standby
+ // should still have no edits!
+ cluster.getNameNode(0).getRpcServer().transitionToActive();
+
+ assertEditFiles(cluster.getNameDirs(0),
+ NNStorage.getInProgressEditsFileName(1));
+ assertEditFiles(
+ Collections.singletonList(cluster.getSharedEditsDir(0, 1)),
+ NNStorage.getInProgressEditsFileName(1));
+ assertNoEditFiles(cluster.getNameDirs(1));
+
+ cluster.getNameNode(0).getRpcServer().mkdirs("/test",
+ FsPermission.createImmutable((short)0755), true);
+
+ // Restarting the standby should not finalize any edits files
+ // in the shared directory when it starts up!
+ cluster.restartNameNode(1);
+
+ assertEditFiles(cluster.getNameDirs(0),
+ NNStorage.getInProgressEditsFileName(1));
+ assertEditFiles(
+ Collections.singletonList(cluster.getSharedEditsDir(0, 1)),
+ NNStorage.getInProgressEditsFileName(1));
+ assertNoEditFiles(cluster.getNameDirs(1));
+
+ // Additionally it should not have applied any in-progress logs
+ // at start-up -- otherwise, it would have read half-way into
+ // the current log segment, and on the next roll, it would have to
+ // either replay starting in the middle of the segment (not allowed)
+ // or double-replay the edits (incorrect).
+ assertNull(NameNodeAdapter.getFileInfo(cluster.getNameNode(1), "/test", true));
+
+ cluster.getNameNode(0).getRpcServer().mkdirs("/test2",
+ FsPermission.createImmutable((short)0755), true);
+
+ // If we restart NN0, it'll come back as standby, and we can
+ // transition NN1 to active and make sure it reads edits correctly at this point.
+ cluster.restartNameNode(0);
+ cluster.getNameNode(1).getRpcServer().transitionToActive();
+
+ // NN1 should have both the edits that came before its restart, and the edits that
+ // came after its restart.
+ assertNotNull(NameNodeAdapter.getFileInfo(cluster.getNameNode(1), "/test", true));
+ assertNotNull(NameNodeAdapter.getFileInfo(cluster.getNameNode(1), "/test2", true));
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ @Test
+ public void testFailoverFinalizesAndReadsInProgress() throws Exception {
+ Configuration conf = new Configuration();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+ try {
+ // Create a fake in-progress edit-log in the shared directory
+ URI sharedUri = cluster.getSharedEditsDir(0, 1);
+ File sharedDir = new File(sharedUri.getPath(), "current");
+ FSImageTestUtil.createAbortedLogWithMkdirs(sharedDir, NUM_DIRS_IN_LOG, 1);
+ assertEditFiles(Collections.singletonList(sharedUri),
+ NNStorage.getInProgressEditsFileName(1));
+
+ // Transition one of the NNs to active
+ cluster.getNameNode(0).getRpcServer().transitionToActive();
+
+ // In the transition to active, it should have read the log -- and
+ // hence see one of the dirs we made in the fake log.
+ String testPath = "/dir" + NUM_DIRS_IN_LOG;
+ assertNotNull(cluster.getNameNode(0).getRpcServer().getFileInfo(testPath));
+
+ // It also should have finalized that log in the shared directory and started
+ // writing to a new one at the next txid.
+ assertEditFiles(Collections.singletonList(sharedUri),
+ NNStorage.getFinalizedEditsFileName(1, NUM_DIRS_IN_LOG + 1),
+ NNStorage.getInProgressEditsFileName(NUM_DIRS_IN_LOG + 2));
+ } finally {
+ cluster.shutdown();
+ }
+
+ }
+
+ /**
+ * Check that no edits files are present in the given storage dirs.
+ */
+ private void assertNoEditFiles(Iterable<URI> dirs) throws IOException {
+ assertEditFiles(dirs, new String[]{});
+ }
+
+ /**
+ * Check that the given list of edits files are present in the given storage
+ * dirs.
+ */
+ private void assertEditFiles(Iterable<URI> dirs, String ... files)
+ throws IOException {
+ for (URI u : dirs) {
+ File editDirRoot = new File(u.getPath());
+ File editDir = new File(editDirRoot, "current");
+ GenericTestUtils.assertExists(editDir);
+ if (files.length == 0) {
+ LOG.info("Checking no edit files exist in " + editDir);
+ } else {
+ LOG.info("Checking for following edit files in " + editDir
+ + ": " + Joiner.on(",").join(files));
+ }
+
+ GenericTestUtils.assertGlobEquals(editDir, "edits_.*", files);
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java
new file mode 100644
index 0000000..cc9552a
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NNStorage;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import com.google.common.base.Joiner;
+
+public class TestFailureOfSharedDir {
+
+ private static final Log LOG = LogFactory.getLog(TestFailureOfSharedDir.class);
+
+ /**
+ * Test that the shared edits dir is automatically added to the list of edits
+ * dirs that are marked required.
+ */
+ @Test
+ public void testSharedDirIsAutomaticallyMarkedRequired()
+ throws URISyntaxException {
+ URI foo = new URI("file:/foo");
+ URI bar = new URI("file:/bar");
+ Configuration conf = new Configuration();
+ conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY, Joiner.on(",").join(foo, bar));
+ conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY, foo.toString());
+ assertFalse(FSNamesystem.getRequiredNamespaceEditsDirs(conf).contains(
+ bar));
+ conf.set(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY, bar.toString());
+ Collection<URI> requiredEditsDirs = FSNamesystem
+ .getRequiredNamespaceEditsDirs(conf);
+ assertTrue(Joiner.on(",").join(requiredEditsDirs) + " does not contain " + bar,
+ requiredEditsDirs.contains(bar));
+ }
+
+ /**
+ * Multiple shared edits directories are an invalid configuration.
+ */
+ @Test
+ public void testMultipleSharedDirsFails() throws Exception {
+ Configuration conf = new Configuration();
+ URI sharedA = new URI("file:///shared-A");
+ URI sharedB = new URI("file:///shared-B");
+ URI localA = new URI("file:///local-A");
+
+ conf.set(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY,
+ Joiner.on(",").join(sharedA,sharedB));
+ conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
+ localA.toString());
+
+ try {
+ FSNamesystem.getNamespaceEditsDirs(conf);
+ fail("Allowed multiple shared edits directories");
+ } catch (IOException ioe) {
+ assertEquals("Multiple shared edits directories are not yet supported",
+ ioe.getMessage());
+ }
+ }
+
+ /**
+ * Make sure that the shared edits dirs are listed before non-shared dirs
+ * when the configuration is parsed. This ensures that the shared journals
+ * are synced before the local ones.
+ */
+ @Test
+ public void testSharedDirsComeFirstInEditsList() throws Exception {
+ Configuration conf = new Configuration();
+ URI sharedA = new URI("file:///shared-A");
+ URI localA = new URI("file:///local-A");
+ URI localB = new URI("file:///local-B");
+ URI localC = new URI("file:///local-C");
+
+ conf.set(DFSConfigKeys.DFS_NAMENODE_SHARED_EDITS_DIR_KEY,
+ sharedA.toString());
+ // List them in reverse order, to make sure they show up in
+ // the order listed, regardless of lexical sort order.
+ conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
+ Joiner.on(",").join(localC, localB, localA));
+ List<URI> dirs = FSNamesystem.getNamespaceEditsDirs(conf);
+ assertEquals(
+ "Shared dirs should come first, then local dirs, in the order " +
+ "they were listed in the configuration.",
+ Joiner.on(",").join(sharedA, localC, localB, localA),
+ Joiner.on(",").join(dirs));
+ }
+
+ /**
+ * Test that marking the shared edits dir as being "required" causes the NN to
+ * fail if that dir can't be accessed.
+ */
+ @Test
+ public void testFailureOfSharedDir() throws Exception {
+ Configuration conf = new Configuration();
+
+ // The shared edits dir will automatically be marked required.
+ MiniDFSCluster cluster = null;
+ File sharedEditsDir = null;
+ try {
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+
+ assertTrue(fs.mkdirs(new Path("/test1")));
+
+ // Blow away the shared edits dir.
+ Runtime mockRuntime = Mockito.mock(Runtime.class);
+ URI sharedEditsUri = cluster.getSharedEditsDir(0, 1);
+ sharedEditsDir = new File(sharedEditsUri);
+ assertEquals(0, FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "-w",
+ true));
+
+ NameNode nn0 = cluster.getNameNode(0);
+ nn0.getNamesystem().getFSImage().getEditLog().getJournalSet()
+ .setRuntimeForTesting(mockRuntime);
+ try {
+ // Make sure that subsequent operations on the NN fail.
+ nn0.getRpcServer().rollEditLog();
+ fail("Succeeded in rolling edit log despite shared dir being deleted");
+ } catch (IOException ioe) {
+ GenericTestUtils.assertExceptionContains(
+ "Unable to start log segment 4: too few journals successfully started",
+ ioe);
+ // By current policy the NN should exit upon this error.
+ // exit() would normally be called once, but since it is mocked it gets
+ // called once during FSEditLog.endCurrentLogSegment() and again during
+ // FSEditLog.startLogSegment(), so the check is atLeastOnce().
+ Mockito.verify(mockRuntime, Mockito.atLeastOnce()).exit(
+ Mockito.anyInt());
+ LOG.info("Got expected exception", ioe);
+ }
+
+ // Check that none of the edits dirs rolled, since the shared edits
+ // dir didn't roll. Regression test for HDFS-2874.
+ for (URI editsUri : cluster.getNameEditsDirs(0)) {
+ if (editsUri.equals(sharedEditsUri)) {
+ continue;
+ }
+ File editsDir = new File(editsUri.getPath());
+ File curDir = new File(editsDir, "current");
+ GenericTestUtils.assertGlobEquals(curDir,
+ "edits_.*",
+ NNStorage.getInProgressEditsFileName(1));
+ }
+ } finally {
+ if (sharedEditsDir != null) {
+ // Restore write permission; without this, test cleanup will fail.
+ FileUtil.chmod(sharedEditsDir.getAbsolutePath(), "+w", true);
+ }
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureToReadEdits.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureToReadEdits.java
new file mode 100644
index 0000000..7bc2d8e
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureToReadEdits.java
@@ -0,0 +1,326 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.mockito.Matchers.anyBoolean;
+import static org.mockito.Matchers.anyInt;
+import static org.mockito.Matchers.anyLong;
+import static org.mockito.Mockito.doAnswer;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.LinkedList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.ha.ServiceFailedException;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.EditLogInputException;
+import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLog;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
+
+import com.google.common.collect.ImmutableList;
+
+public class TestFailureToReadEdits {
+
+ private static final Log LOG = LogFactory.getLog(TestFailureToReadEdits.class);
+
+ private static final String TEST_DIR1 = "/test1";
+ private static final String TEST_DIR2 = "/test2";
+ private static final String TEST_DIR3 = "/test3";
+
+ private Configuration conf;
+ private Runtime mockRuntime = mock(Runtime.class);
+ private MiniDFSCluster cluster;
+ private NameNode nn0;
+ private NameNode nn1;
+ private FileSystem fs;
+
+ @Before
+ public void setUpCluster() throws Exception {
+ conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY, 10);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ HAUtil.setAllowStandbyReads(conf, true);
+
+ MiniDFSNNTopology topology = new MiniDFSNNTopology()
+ .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
+ .addNN(new MiniDFSNNTopology.NNConf("nn1").setHttpPort(10001))
+ .addNN(new MiniDFSNNTopology.NNConf("nn2").setHttpPort(10002)));
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(topology)
+ .numDataNodes(0)
+ .build();
+
+ cluster.waitActive();
+
+ nn0 = cluster.getNameNode(0);
+ nn1 = cluster.getNameNode(1);
+ nn1.getNamesystem().getEditLogTailer().setRuntime(mockRuntime);
+
+ cluster.transitionToActive(0);
+ fs = HATestUtil.configureFailoverFs(cluster, conf);
+ }
+
+ @After
+ public void tearDownCluster() throws Exception {
+ if (fs != null) {
+ fs.close();
+ }
+
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Test that the standby NN won't double-replay earlier edits if it encounters
+ * a failure to read a later edit.
+ */
+ @Test
+ public void testFailuretoReadEdits() throws Exception {
+ assertTrue(fs.mkdirs(new Path(TEST_DIR1)));
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ // If these two ops are applied twice, the first op will throw an
+ // exception the second time it's replayed.
+ fs.setOwner(new Path(TEST_DIR1), "foo", "bar");
+ assertTrue(fs.delete(new Path(TEST_DIR1), true));
+
+ // This op should get applied just fine.
+ assertTrue(fs.mkdirs(new Path(TEST_DIR2)));
+
+ // This is the op the mocking will cause to fail to be read.
+ assertTrue(fs.mkdirs(new Path(TEST_DIR3)));
+
+ LimitedEditLogAnswer answer = causeFailureOnEditLogRead();
+
+ try {
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ fail("Standby fully caught up, but should not have been able to");
+ } catch (HATestUtil.CouldNotCatchUpException e) {
+ verify(mockRuntime, times(0)).exit(anyInt());
+ }
+
+ // Null because it was deleted.
+ assertNull(NameNodeAdapter.getFileInfo(nn1,
+ TEST_DIR1, false));
+ // Should have been successfully created.
+ assertTrue(NameNodeAdapter.getFileInfo(nn1,
+ TEST_DIR2, false).isDir());
+ // Null because it hasn't been created yet.
+ assertNull(NameNodeAdapter.getFileInfo(nn1,
+ TEST_DIR3, false));
+
+ // Now let the standby read ALL the edits.
+ answer.setThrowExceptionOnRead(false);
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ // Null because it was deleted.
+ assertNull(NameNodeAdapter.getFileInfo(nn1,
+ TEST_DIR1, false));
+ // Should have been successfully created.
+ assertTrue(NameNodeAdapter.getFileInfo(nn1,
+ TEST_DIR2, false).isDir());
+ // Should now have been successfully created.
+ assertTrue(NameNodeAdapter.getFileInfo(nn1,
+ TEST_DIR3, false).isDir());
+ }
+
+ /**
+ * Test the following case:
+ * 1. SBN is reading a finalized edits file when NFS disappears halfway
+ * through (or some intermittent error happens)
+ * 2. SBN performs a checkpoint and uploads it to the NN
+ * 3. NN receives a checkpoint that doesn't correspond to the end of any log
+ * segment
+ * 4. Both NN and SBN should be able to restart at this point.
+ *
+ * This is a regression test for HDFS-2766.
+ */
+ @Test
+ public void testCheckpointStartingMidEditsFile() throws Exception {
+ assertTrue(fs.mkdirs(new Path(TEST_DIR1)));
+
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ // Once the standby catches up, it should notice that it needs to
+ // do a checkpoint and save one to its local directories.
+ HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(0, 3));
+
+ // It should also upload it back to the active.
+ HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 3));
+
+ causeFailureOnEditLogRead();
+
+ assertTrue(fs.mkdirs(new Path(TEST_DIR2)));
+ assertTrue(fs.mkdirs(new Path(TEST_DIR3)));
+
+ try {
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ fail("Standby fully caught up, but should not have been able to");
+ } catch (HATestUtil.CouldNotCatchUpException e) {
+ verify(mockRuntime, times(0)).exit(anyInt());
+ }
+
+ // 5 because we should get OP_START_LOG_SEGMENT and one successful OP_MKDIR
+ HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(0, 3, 5));
+
+ // It should also upload it back to the active.
+ HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 3, 5));
+
+ // Restart the active NN
+ cluster.restartNameNode(0);
+
+ HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 3, 5));
+
+ FileSystem fs0 = null;
+ try {
+ // Make sure that when the active restarts, it loads all the edits.
+ fs0 = FileSystem.get(NameNode.getUri(nn0.getNameNodeAddress()),
+ conf);
+
+ assertTrue(fs0.exists(new Path(TEST_DIR1)));
+ assertTrue(fs0.exists(new Path(TEST_DIR2)));
+ assertTrue(fs0.exists(new Path(TEST_DIR3)));
+ } finally {
+ if (fs0 != null)
+ fs0.close();
+ }
+ }
+
+ /**
+ * Ensure that the standby fails to become active if it cannot read all
+ * available edits in the shared edits dir when it is transitioning to active
+ * state.
+ */
+ @Test
+ public void testFailureToReadEditsOnTransitionToActive() throws Exception {
+ assertTrue(fs.mkdirs(new Path(TEST_DIR1)));
+
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ // It should also upload it back to the active.
+ HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 3));
+
+ causeFailureOnEditLogRead();
+
+ assertTrue(fs.mkdirs(new Path(TEST_DIR2)));
+ assertTrue(fs.mkdirs(new Path(TEST_DIR3)));
+
+ try {
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ fail("Standby fully caught up, but should not have been able to");
+ } catch (HATestUtil.CouldNotCatchUpException e) {
+ verify(mockRuntime, times(0)).exit(anyInt());
+ }
+
+ // Shutdown the active NN.
+ cluster.shutdownNameNode(0);
+
+ try {
+ // Transition the standby to active.
+ cluster.transitionToActive(1);
+ fail("Standby transitioned to active, but should not have been able to");
+ } catch (ServiceFailedException sfe) {
+ LOG.info("got expected exception: " + sfe.toString(), sfe);
+ assertTrue("Standby failed to catch up for some reason other than "
+ + "failure to read logs", sfe.toString().contains(
+ EditLogInputException.class.getName()));
+ }
+ }
+
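+ /**
+ * Replace the standby's edit log with a Mockito spy whose
+ * selectInputStreams() returns streams that fail when reading the op
+ * that creates TEST_DIR3.
+ */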
+ private LimitedEditLogAnswer causeFailureOnEditLogRead() throws IOException {
+ FSEditLog spyEditLog = spy(nn1.getNamesystem().getEditLogTailer()
+ .getEditLog());
+ LimitedEditLogAnswer answer = new LimitedEditLogAnswer();
+ doAnswer(answer).when(spyEditLog).selectInputStreams(
+ anyLong(), anyLong(), anyBoolean());
+ nn1.getNamesystem().getEditLogTailer().setEditLog(spyEditLog);
+
+ return answer;
+ }
+
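+ /**
+ * Answer that wraps the real edit log streams; while enabled, reading the
+ * op that creates TEST_DIR3 throws an IOException, simulating an
+ * intermittent failure partway through an edits file.
+ */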
+ private static class LimitedEditLogAnswer
+ implements Answer<Collection<EditLogInputStream>> {
+
+ private boolean throwExceptionOnRead = true;
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public Collection<EditLogInputStream> answer(InvocationOnMock invocation)
+ throws Throwable {
+ Collection<EditLogInputStream> streams = (Collection<EditLogInputStream>)
+ invocation.callRealMethod();
+
+ if (!throwExceptionOnRead) {
+ return streams;
+ } else {
+ Collection<EditLogInputStream> ret = new LinkedList<EditLogInputStream>();
+ for (EditLogInputStream stream : streams) {
+ EditLogInputStream spyStream = spy(stream);
+ doAnswer(new Answer<FSEditLogOp>() {
+
+ @Override
+ public FSEditLogOp answer(InvocationOnMock invocation)
+ throws Throwable {
+ FSEditLogOp op = (FSEditLogOp) invocation.callRealMethod();
+ if (throwExceptionOnRead &&
+ TEST_DIR3.equals(NameNodeAdapter.getMkdirOpPath(op))) {
+ throw new IOException("failed to read op creating " + TEST_DIR3);
+ } else {
+ return op;
+ }
+ }
+
+ }).when(spyStream).readOp();
+ ret.add(spyStream);
+ }
+ return ret;
+ }
+ }
+
+ public void setThrowExceptionOnRead(boolean throwExceptionOnRead) {
+ this.throwExceptionOnRead = throwExceptionOnRead;
+ }
+ }
+
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestGetGroupsWithHA.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestGetGroupsWithHA.java
new file mode 100644
index 0000000..e548817
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestGetGroupsWithHA.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.tools.GetGroups;
+import org.apache.hadoop.tools.GetGroupsTestBase;
+import org.apache.hadoop.util.Tool;
+import org.junit.After;
+import org.junit.Before;
+
+public class TestGetGroupsWithHA extends GetGroupsTestBase {
+
+ private MiniDFSCluster cluster;
+
+ @Before
+ public void setUpNameNode() throws IOException {
+ conf = new HdfsConfiguration();
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0).build();
+ HATestUtil.setFailoverConfigurations(cluster, conf);
+ }
+
+ @After
+ public void tearDownNameNode() {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+
+ @Override
+ protected Tool getTool(PrintStream o) {
+ return new GetGroups(conf, o);
+ }
+
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAConfiguration.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAConfiguration.java
new file mode 100644
index 0000000..9cd6ab7
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAConfiguration.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+/**
+ * Test cases checking that the HA configuration is reasonably validated and
+ * interpreted in various places. These should be proper unit tests
+ * which don't start daemons.
+ */
+public class TestHAConfiguration {
+
+ private FSNamesystem fsn = Mockito.mock(FSNamesystem.class);
+
+ @Test
+ public void testCheckpointerValidityChecks() throws Exception {
+ try {
+ Configuration conf = new Configuration();
+ new StandbyCheckpointer(conf, fsn);
+ fail("Bad config did not throw an error");
+ } catch (IllegalArgumentException iae) {
+ GenericTestUtils.assertExceptionContains(
+ "Invalid URI for NameNode address", iae);
+ }
+ }
+
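+ /**
+ * Build a minimal HA configuration for the given nameservice with two
+ * NameNodes (nn1, nn2) whose RPC addresses point at the given hosts.
+ */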
+ private Configuration getHAConf(String nsId, String host1, String host2) {
+ Configuration conf = new Configuration();
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, nsId);
+ conf.set(DFSUtil.addKeySuffixes(
+ DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX, nsId),
+ "nn1,nn2");
+ conf.set(DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY, "nn1");
+ conf.set(DFSUtil.addKeySuffixes(
+ DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, nsId, "nn1"),
+ host1 + ":12345");
+ conf.set(DFSUtil.addKeySuffixes(
+ DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, nsId, "nn2"),
+ host2 + ":12345");
+ return conf;
+ }
+
+ @Test
+ public void testGetOtherNNHttpAddress() {
+ // Use non-local addresses to avoid host address matching
+ Configuration conf = getHAConf("ns1", "1.2.3.1", "1.2.3.2");
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICE_ID, "ns1");
+
+ // This is done by the NN before the StandbyCheckpointer is created
+ NameNode.initializeGenericKeys(conf, "ns1", "nn1");
+
+ // Since we didn't configure the HTTP address, and the default is
+ // 0.0.0.0, it should substitute the address from the RPC configuration
+ // above.
+ StandbyCheckpointer checkpointer = new StandbyCheckpointer(conf, fsn);
+ assertEquals("1.2.3.2:" + DFSConfigKeys.DFS_NAMENODE_HTTP_PORT_DEFAULT,
+ checkpointer.getActiveNNAddress());
+ }
+
+ /**
+ * Tests that the namenode edits dirs and shared edits dirs are returned
+ * with duplicates removed.
+ */
+ @Test
+ public void testHAUniqueEditDirs() throws IOException {
+ Configuration conf = new Configuration();
+
+ conf.set(DFS_NAMENODE_EDITS_DIR_KEY, "file://edits/dir, "
+ + "file://edits/shared/dir"); // overlapping
+ conf.set(DFS_NAMENODE_SHARED_EDITS_DIR_KEY, "file://edits/shared/dir");
+
+ // getNamespaceEditsDirs removes duplicates across edits and shared.edits
+ Collection<URI> editsDirs = FSNamesystem.getNamespaceEditsDirs(conf);
+ assertEquals(2, editsDirs.size());
+ }
+
+ /**
+ * Test that the 2NN does not start if given a config with HA NNs.
+ */
+ @Test
+ public void testSecondaryNameNodeDoesNotStart() throws IOException {
+ // Note we're not explicitly setting the nameservice Id in the
+ // config as it is not required to be set and we want to test
+ // that we can determine if HA is enabled when the nameservice Id
+ // is not explicitly defined.
+ Configuration conf = getHAConf("ns1", "1.2.3.1", "1.2.3.2");
+ try {
+ new SecondaryNameNode(conf);
+ fail("Created a 2NN with an HA config");
+ } catch (IOException ioe) {
+ GenericTestUtils.assertExceptionContains(
+ "Cannot use SecondaryNameNode in an HA cluster", ioe);
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAFsck.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAFsck.java
new file mode 100644
index 0000000..10218f2
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAFsck.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.ByteArrayOutputStream;
+import java.io.PrintStream;
+
+import junit.framework.Assert;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.tools.DFSck;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.log4j.Level;
+import org.junit.Test;
+
+public class TestHAFsck {
+
+ static {
+ ((Log4JLogger)LogFactory.getLog(DFSUtil.class)).getLogger().setLevel(Level.ALL);
+ }
+
+ /**
+ * Test that fsck still works with HA enabled.
+ */
+ @Test
+ public void testHaFsck() throws Exception {
+ Configuration conf = new Configuration();
+
+ // need some HTTP ports
+ MiniDFSNNTopology topology = new MiniDFSNNTopology()
+ .addNameservice(new MiniDFSNNTopology.NSConf("ha-nn-uri-0")
+ .addNN(new MiniDFSNNTopology.NNConf("nn1").setHttpPort(10001))
+ .addNN(new MiniDFSNNTopology.NNConf("nn2").setHttpPort(10002)));
+
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(topology)
+ .numDataNodes(0)
+ .build();
+ FileSystem fs = null;
+ try {
+ cluster.waitActive();
+
+ cluster.transitionToActive(0);
+
+ // Make sure conf has the relevant HA configs.
+ HATestUtil.setFailoverConfigurations(cluster, conf, "ha-nn-uri-0", 0);
+
+ fs = HATestUtil.configureFailoverFs(cluster, conf);
+ fs.mkdirs(new Path("/test1"));
+ fs.mkdirs(new Path("/test2"));
+
+ runFsck(conf);
+
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ runFsck(conf);
+ } finally {
+ if (fs != null) {
+ fs.close();
+ }
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+ }
+
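+ /**
+ * Run fsck on the root with the -files option and check that it succeeds
+ * and lists the directories created by the test.
+ */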
+ static void runFsck(Configuration conf) throws Exception {
+ ByteArrayOutputStream bStream = new ByteArrayOutputStream();
+ PrintStream out = new PrintStream(bStream, true);
+ int errCode = ToolRunner.run(new DFSck(conf, out),
+ new String[]{"/", "-files"});
+ String result = bStream.toString();
+ System.out.println("output from fsck:\n" + result);
+ Assert.assertEquals(0, errCode);
+ assertTrue(result.contains("/test1"));
+ assertTrue(result.contains("/test2"));
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAMetrics.java
new file mode 100644
index 0000000..cc85c83
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAMetrics.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.io.IOUtils;
+import org.junit.Test;
+
+/**
+ * Make sure HA-related metrics are updated and reported appropriately.
+ */
+public class TestHAMetrics {
+
+ private static final Log LOG = LogFactory.getLog(TestHAMetrics.class);
+
+ @Test
+ public void testHAMetrics() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, Integer.MAX_VALUE);
+
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology()).numDataNodes(1)
+ .build();
+ FileSystem fs = null;
+ try {
+ cluster.waitActive();
+
+ FSNamesystem nn0 = cluster.getNamesystem(0);
+ FSNamesystem nn1 = cluster.getNamesystem(1);
+
+ assertEquals("standby", nn0.getHAState());
+ assertTrue(0 < nn0.getMillisSinceLastLoadedEdits());
+ assertEquals("standby", nn1.getHAState());
+ assertTrue(0 < nn1.getMillisSinceLastLoadedEdits());
+
+ cluster.transitionToActive(0);
+
+ assertEquals("active", nn0.getHAState());
+ assertEquals(0, nn0.getMillisSinceLastLoadedEdits());
+ assertEquals("standby", nn1.getHAState());
+ assertTrue(0 < nn1.getMillisSinceLastLoadedEdits());
+
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ assertEquals("standby", nn0.getHAState());
+ assertTrue(0 < nn0.getMillisSinceLastLoadedEdits());
+ assertEquals("active", nn1.getHAState());
+ assertEquals(0, nn1.getMillisSinceLastLoadedEdits());
+
+ Thread.sleep(2000); // make sure standby gets a little out-of-date
+ assertTrue(2000 <= nn0.getMillisSinceLastLoadedEdits());
+
+ assertEquals(0, nn0.getPendingDataNodeMessageCount());
+ assertEquals(0, nn1.getPendingDataNodeMessageCount());
+
+ fs = HATestUtil.configureFailoverFs(cluster, conf);
+ DFSTestUtil.createFile(fs, new Path("/foo"),
+ 10, (short)1, 1L);
+
+ assertTrue(0 < nn0.getPendingDataNodeMessageCount());
+ assertEquals(0, nn1.getPendingDataNodeMessageCount());
+ long millisSinceLastLoadedEdits = nn0.getMillisSinceLastLoadedEdits();
+
+ HATestUtil.waitForStandbyToCatchUp(cluster.getNameNode(1),
+ cluster.getNameNode(0));
+
+ assertEquals(0, nn0.getPendingDataNodeMessageCount());
+ assertEquals(0, nn1.getPendingDataNodeMessageCount());
+ long newMillisSinceLastLoadedEdits = nn0.getMillisSinceLastLoadedEdits();
+ // Since we just waited for the standby to catch up, the time since we
+ // last loaded edits should be very low.
+ assertTrue("expected " + millisSinceLastLoadedEdits + " > " +
+ newMillisSinceLastLoadedEdits,
+ millisSinceLastLoadedEdits > newMillisSinceLastLoadedEdits);
+ } finally {
+ IOUtils.cleanup(LOG, fs);
+ cluster.shutdown();
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java
new file mode 100644
index 0000000..8790d0f
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java
@@ -0,0 +1,648 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+import static org.mockito.Matchers.anyInt;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
+import org.apache.hadoop.hdfs.server.namenode.FSImage;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.log4j.Level;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.base.Supplier;
+import com.google.common.collect.Lists;
+
+/**
+ * Tests that exercise safemode in an HA cluster.
+ */
+public class TestHASafeMode {
+ private static final Log LOG = LogFactory.getLog(TestHASafeMode.class);
+ private static final int BLOCK_SIZE = 1024;
+ private NameNode nn0;
+ private NameNode nn1;
+ private FileSystem fs;
+ private MiniDFSCluster cluster;
+ private Runtime mockRuntime = mock(Runtime.class);
+
+ static {
+ ((Log4JLogger)LogFactory.getLog(FSImage.class)).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
+ }
+
+ @Before
+ public void setupCluster() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
+ conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(3)
+ .waitSafeMode(false)
+ .build();
+ cluster.waitActive();
+
+ nn0 = cluster.getNameNode(0);
+ nn1 = cluster.getNameNode(1);
+ fs = HATestUtil.configureFailoverFs(cluster, conf);
+
+ nn0.getNamesystem().getEditLogTailer().setRuntime(mockRuntime);
+
+ cluster.transitionToActive(0);
+ }
+
+ @After
+ public void shutdownCluster() throws IOException {
+ if (cluster != null) {
+ verify(mockRuntime, times(0)).exit(anyInt());
+ cluster.shutdown();
+ }
+ }
+
+ private void restartStandby() throws IOException {
+ cluster.shutdownNameNode(1);
+ // Set the safemode extension to be lengthy, so that the tests
+ // can check the safemode message after the safemode conditions
+ // have been achieved, without being racy.
+ cluster.getConfiguration(1).setInt(
+ DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000);
+ cluster.getConfiguration(1).setInt(
+ DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+
+ cluster.restartNameNode(1);
+ nn1 = cluster.getNameNode(1);
+ assertEquals(nn1.getNamesystem().getTransactionsSinceLastLogRoll(), 0L);
+ }
+
+ /**
+ * Test case for entering safemode in the active namenode when it is already
+ * in startup safemode. This is a regression test for HDFS-2747.
+ */
+ @Test
+ public void testEnterSafeModeInANNShouldNotThrowNPE() throws Exception {
+ banner("Restarting active");
+ DFSTestUtil
+ .createFile(fs, new Path("/test"), 3 * BLOCK_SIZE, (short) 3, 1L);
+ restartActive();
+ nn0.getRpcServer().transitionToActive();
+
+ FSNamesystem namesystem = nn0.getNamesystem();
+ String status = namesystem.getSafemode();
+ assertTrue("Bad safemode status: '" + status + "'", status
+ .startsWith("Safe mode is ON."));
+ NameNodeAdapter.enterSafeMode(nn0, false);
+ assertTrue("Failed to enter into safemode in active", namesystem
+ .isInSafeMode());
+ NameNodeAdapter.enterSafeMode(nn0, false);
+ assertTrue("Failed to enter into safemode in active", namesystem
+ .isInSafeMode());
+ }
+
+ /**
+ * Test case for entering safemode in the standby namenode when it is already
+ * in startup safemode. This is a regression test for HDFS-2747.
+ */
+ @Test
+ public void testEnterSafeModeInSBNShouldNotThrowNPE() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ DFSTestUtil
+ .createFile(fs, new Path("/test"), 3 * BLOCK_SIZE, (short) 3, 1L);
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup and enter safemode.
+ nn0.getRpcServer().rollEditLog();
+ banner("Creating some blocks that won't be in the edit log");
+ DFSTestUtil.createFile(fs, new Path("/test2"), 5 * BLOCK_SIZE, (short) 3,
+ 1L);
+ banner("Deleting the original blocks");
+ fs.delete(new Path("/test"), true);
+ banner("Restarting standby");
+ restartStandby();
+ FSNamesystem namesystem = nn1.getNamesystem();
+ String status = namesystem.getSafemode();
+ assertTrue("Bad safemode status: '" + status + "'", status
+ .startsWith("Safe mode is ON."));
+ NameNodeAdapter.enterSafeMode(nn1, false);
+ assertTrue("Failed to enter into safemode in standby", namesystem
+ .isInSafeMode());
+ NameNodeAdapter.enterSafeMode(nn1, false);
+ assertTrue("Failed to enter into safemode in standby", namesystem
+ .isInSafeMode());
+ }
+
+ private void restartActive() throws IOException {
+ cluster.shutdownNameNode(0);
+ // Set the safemode extension to be lengthy, so that the tests
+ // can check the safemode message after the safemode conditions
+ // have been achieved, without being racy.
+ cluster.getConfiguration(0).setInt(
+ DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 30000);
+ cluster.restartNameNode(0);
+ nn0 = cluster.getNameNode(0);
+ }
+
+ /**
+ * Tests the case where, while the standby is down, more blocks are
+ * added to the namespace but the edit log is not rolled. So, when the
+ * standby starts up, it receives notification about the new blocks during
+ * the safemode extension period.
+ */
+ @Test
+ public void testBlocksAddedBeforeStandbyRestart() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L);
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup.
+ nn0.getRpcServer().rollEditLog();
+
+ banner("Creating some blocks that won't be in the edit log");
+ DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L);
+
+ banner("Restarting standby");
+ restartStandby();
+
+ // We expect it not to be stuck in safemode, since those blocks
+ // that are already visible to the SBN should be processed
+ // in the initial block reports.
+ assertSafeMode(nn1, 3, 3);
+
+ banner("Waiting for standby to catch up to active namespace");
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ assertSafeMode(nn1, 8, 8);
+ }
+
+ /**
+ * Similar to {@link #testBlocksAddedBeforeStandbyRestart()} except that
+ * the new blocks are allocated after the SBN has restarted. So, the
+ * blocks were not present in the original block reports at startup
+ * but are reported separately by blockReceived calls.
+ */
+ @Test
+ public void testBlocksAddedWhileInSafeMode() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L);
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup.
+ nn0.getRpcServer().rollEditLog();
+
+ banner("Restarting standby");
+ restartStandby();
+
+ assertSafeMode(nn1, 3, 3);
+
+ // Create a few blocks which will send blockReceived calls to the
+ // SBN.
+ banner("Creating some blocks while SBN is in safe mode");
+ DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L);
+
+
+ banner("Waiting for standby to catch up to active namespace");
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ assertSafeMode(nn1, 8, 8);
+ }
+
+ /**
+ * Test for the following case proposed by ATM:
+ * 1. Both NNs are up, one is active. There are 100 blocks. Both are
+ * out of safemode.
+ * 2. 10 block deletions get processed by NN1. NN2 enqueues these DN messages
+ * until it next reads from a checkpointed edits file.
+ * 3. NN2 gets restarted. Its queues are lost.
+ * 4. NN2 comes up, reads from all the finalized edits files. Concludes there
+ * should still be 100 blocks.
+ * 5. NN2 receives a block report from all the DNs, which only accounts for
+ * 90 blocks. It doesn't leave safemode.
+ * 6. NN1 dies or is transitioned to standby.
+ * 7. NN2 is transitioned to active. It reads all the edits from NN1. It now
+ * knows there should only be 90 blocks, but it's still in safemode.
+ * 8. NN2 doesn't ever recheck whether it should leave safemode.
+ *
+ * This is essentially the inverse of {@link #testBlocksAddedBeforeStandbyRestart()}
+ */
+ @Test
+ public void testBlocksRemovedBeforeStandbyRestart() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ DFSTestUtil.createFile(fs, new Path("/test"), 5*BLOCK_SIZE, (short) 3, 1L);
+
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup.
+ nn0.getRpcServer().rollEditLog();
+
+ // Delete those blocks again, so they won't get reported to the SBN
+ // once it starts up
+ banner("Removing the blocks without rolling the edit log");
+ fs.delete(new Path("/test"), true);
+ BlockManagerTestUtil.computeAllPendingWork(
+ nn0.getNamesystem().getBlockManager());
+ cluster.triggerHeartbeats();
+
+ banner("Restarting standby");
+ restartStandby();
+ assertSafeMode(nn1, 0, 5);
+
+ banner("Waiting for standby to catch up to active namespace");
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ assertSafeMode(nn1, 0, 0);
+ }
+
+ /**
+ * Similar to {@link #testBlocksRemovedBeforeStandbyRestart()} except that
+ * the blocks are removed after the SBN has restarted. So, the
+ * blocks were present in the original block reports at startup
+ * but are deleted separately later by deletion reports.
+ */
+ @Test
+ public void testBlocksRemovedWhileInSafeMode() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ DFSTestUtil.createFile(fs, new Path("/test"), 10*BLOCK_SIZE, (short) 3, 1L);
+
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup.
+ nn0.getRpcServer().rollEditLog();
+
+ banner("Restarting standby");
+ restartStandby();
+
+ // It will initially have all of the blocks necessary.
+ assertSafeMode(nn1, 10, 10);
+
+ // Delete those blocks while the SBN is in safe mode.
+ // This will not ACK the deletions to the SBN, so it won't notice
+ // them until the edit log is rolled.
+ banner("Removing the blocks without rolling the edit log");
+ fs.delete(new Path("/test"), true);
+ BlockManagerTestUtil.computeAllPendingWork(
+ nn0.getNamesystem().getBlockManager());
+
+ banner("Triggering deletions on DNs and Deletion Reports");
+ cluster.triggerHeartbeats();
+ HATestUtil.waitForDNDeletions(cluster);
+ cluster.triggerDeletionReports();
+
+ assertSafeMode(nn1, 10, 10);
+
+ // When we catch up to active namespace, it will restore back
+ // to 0 blocks.
+ banner("Waiting for standby to catch up to active namespace");
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ assertSafeMode(nn1, 0, 0);
+ }
+
+ /**
+ * Tests that the standby node properly tracks the number of total
+ * and safe blocks while it is in safe mode. Since safe-mode only
+ * counts completed blocks, append needs to decrement the total
+ * number of blocks and then re-increment when the file is closed
+ * again.
+ */
+ @Test
+ public void testAppendWhileInSafeMode() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ // Make 4.5 blocks so that append() will re-open an existing block
+ // instead of just adding a new one
+ DFSTestUtil.createFile(fs, new Path("/test"),
+ 4*BLOCK_SIZE + BLOCK_SIZE/2, (short) 3, 1L);
+
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup.
+ nn0.getRpcServer().rollEditLog();
+
+ banner("Restarting standby");
+ restartStandby();
+
+ // It will initially have all of the blocks necessary.
+ assertSafeMode(nn1, 5, 5);
+
+ // Append to a block while SBN is in safe mode. This should
+ // not affect safemode initially, since the DN message
+ // will get queued.
+ FSDataOutputStream stm = fs.append(new Path("/test"));
+ try {
+ assertSafeMode(nn1, 5, 5);
+
+ // If we roll edits now, the SBN should see that the last block is under
+ // construction and decrement both its total count and safe count by one,
+ // since UC blocks are not counted by safe mode.
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ assertSafeMode(nn1, 4, 4);
+ } finally {
+ IOUtils.closeStream(stm);
+ }
+
+ // Delete those blocks while the SBN is in safe mode.
+ // This will not ACK the deletions to the SBN, so it won't
+ // notice until we roll the edit log.
+ banner("Removing the blocks without rolling the edit log");
+ fs.delete(new Path("/test"), true);
+ BlockManagerTestUtil.computeAllPendingWork(
+ nn0.getNamesystem().getBlockManager());
+
+ banner("Triggering deletions on DNs and Deletion Reports");
+ cluster.triggerHeartbeats();
+ HATestUtil.waitForDNDeletions(cluster);
+ cluster.triggerDeletionReports();
+
+ assertSafeMode(nn1, 4, 4);
+
+ // When we roll the edit log, the deletions will go through.
+ banner("Waiting for standby to catch up to active namespace");
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ assertSafeMode(nn1, 0, 0);
+ }
+
+ /**
+ * Regression test for a bug experienced while developing
+ * HDFS-2742. The scenario here is:
+ * - image contains some blocks
+ * - edits log contains at least one block addition, followed
+ * by deletion of more blocks than were added.
+ * - When the node starts up, incorrect accounting of block totals
+ * caused an assertion failure.
+ */
+ @Test
+ public void testBlocksDeletedInEditLog() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ // Make 4 blocks persisted in the image.
+ DFSTestUtil.createFile(fs, new Path("/test"),
+ 4*BLOCK_SIZE, (short) 3, 1L);
+ NameNodeAdapter.enterSafeMode(nn0, false);
+ NameNodeAdapter.saveNamespace(nn0);
+ NameNodeAdapter.leaveSafeMode(nn0, false);
+
+ // OP_ADD for 2 blocks
+ DFSTestUtil.createFile(fs, new Path("/test2"),
+ 2*BLOCK_SIZE, (short) 3, 1L);
+
+ // OP_DELETE for 4 blocks
+ fs.delete(new Path("/test"), true);
+
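+ // Restarting would previously trip an assertion failure while replaying
+ // these edits, due to the incorrect block total accounting (HDFS-2742).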
+ restartActive();
+ }
+
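+ /**
+ * Assert that the given NN is in safe mode and that its safe mode status
+ * string reports the given number of safe and total blocks.
+ */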
+ private void assertSafeMode(NameNode nn, int safe, int total) {
+ String status = nn.getNamesystem().getSafemode();
+ if (safe == total) {
+ assertTrue("Bad safemode status: '" + status + "'",
+ status.startsWith(
+ "Safe mode is ON." +
+ "The reported blocks " + safe + " has reached the threshold " +
+ "0.9990 of total blocks " + total + ". Safe mode will be " +
+ "turned off automatically"));
+ } else {
+ int additional = total - safe;
+ assertTrue("Bad safemode status: '" + status + "'",
+ status.startsWith(
+ "Safe mode is ON." +
+ "The reported blocks " + safe + " needs additional " +
+ additional + " blocks"));
+ }
+ }
+
+ /**
+ * Set up a namesystem with several edits, both deletions and
+ * additions, and failover to a new NN while that NN is in
+ * safemode. Ensure that it will exit safemode.
+ */
+ @Test
+ public void testComplexFailoverIntoSafemode() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L);
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup and enter safemode.
+ nn0.getRpcServer().rollEditLog();
+
+ banner("Creating some blocks that won't be in the edit log");
+ DFSTestUtil.createFile(fs, new Path("/test2"), 5*BLOCK_SIZE, (short) 3, 1L);
+
+ banner("Deleting the original blocks");
+ fs.delete(new Path("/test"), true);
+
+ banner("Restarting standby");
+ restartStandby();
+
+ // We expect it to be on its way out of safemode, since all of the blocks
+ // from the edit log have been reported.
+ assertSafeMode(nn1, 3, 3);
+
+ // Initiate a failover into it while it's in safemode
+ banner("Initiating a failover into NN1 in safemode");
+ NameNodeAdapter.abortEditLogs(nn0);
+ cluster.transitionToActive(1);
+
+ assertSafeMode(nn1, 5, 5);
+ }
+
+ /**
+ * Similar to {@link #testBlocksRemovedWhileInSafeMode()} except that
+ * the OP_DELETE edits arrive at the SBN before the block deletion reports.
+ * The tracking of safe blocks needs to properly account for the removal
+ * of the blocks as well as the safe count. This is a regression test for
+ * HDFS-2742.
+ */
+ @Test
+ public void testBlocksRemovedWhileInSafeModeEditsArriveFirst() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some blocks");
+ DFSTestUtil.createFile(fs, new Path("/test"), 10*BLOCK_SIZE, (short) 3, 1L);
+
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup.
+ nn0.getRpcServer().rollEditLog();
+
+ banner("Restarting standby");
+ restartStandby();
+
+ // It will initially have all of the blocks necessary.
+ String status = nn1.getNamesystem().getSafemode();
+ assertTrue("Bad safemode status: '" + status + "'",
+ status.startsWith(
+ "Safe mode is ON." +
+ "The reported blocks 10 has reached the threshold 0.9990 of " +
+ "total blocks 10. Safe mode will be turned off automatically"));
+
+ // Delete those blocks while the SBN is in safe mode.
+ // Immediately roll the edit log before the actual deletions are sent
+ // to the DNs.
+ banner("Removing the blocks without rolling the edit log");
+ fs.delete(new Path("/test"), true);
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ // Should see removal of the blocks as well as their contribution to safe block count.
+ assertSafeMode(nn1, 0, 0);
+
+
+ banner("Triggering sending deletions to DNs and Deletion Reports");
+ BlockManagerTestUtil.computeAllPendingWork(
+ nn0.getNamesystem().getBlockManager());
+ cluster.triggerHeartbeats();
+ HATestUtil.waitForDNDeletions(cluster);
+ cluster.triggerDeletionReports();
+
+ // No change in assertion status here, but some of the consistency checks
+ // in safemode will fire here if we accidentally decrement safe block count
+ // below 0.
+ assertSafeMode(nn1, 0, 0);
+ }
+
+
+ /**
+ * Test that the number of safe blocks is accounted correctly even when
+ * blocks move between under-construction state and completed state.
+ * If a FINALIZED report arrives at the SBN before the block is marked
+ * COMPLETE, then when we get the OP_CLOSE we need to count it as "safe"
+ * at that point. This is a regression test for HDFS-2742.
+ */
+ @Test
+ public void testSafeBlockTracking() throws Exception {
+ banner("Starting with NN0 active and NN1 standby, creating some " +
+ "UC blocks plus some other blocks to force safemode");
+ DFSTestUtil.createFile(fs, new Path("/other-blocks"), 10*BLOCK_SIZE, (short) 3, 1L);
+
+ List<FSDataOutputStream> stms = Lists.newArrayList();
+ try {
+ for (int i = 0; i < 5; i++) {
+ FSDataOutputStream stm = fs.create(new Path("/test-uc-" + i));
+ stms.add(stm);
+ stm.write(1);
+ stm.hflush();
+ }
+ // Roll edit log so that, when the SBN restarts, it will load
+ // the namespace during startup and enter safemode.
+ nn0.getRpcServer().rollEditLog();
+ } finally {
+ for (FSDataOutputStream stm : stms) {
+ IOUtils.closeStream(stm);
+ }
+ }
+
+ banner("Restarting SBN");
+ restartStandby();
+ assertSafeMode(nn1, 10, 10);
+
+ banner("Allowing SBN to catch up");
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ assertSafeMode(nn1, 15, 15);
+ }
+
+ /**
+ * Regression test for HDFS-2753. In this bug, the following sequence was
+ * observed:
+ * - Some blocks are written to DNs while the SBN was down. This causes
+ * the blockReceived messages to get queued in the BPServiceActor on the
+ * DN.
+ * - When the SBN returns, the DN re-registers with the SBN, and then
+ * flushes its blockReceived queue to the SBN before it sends its
+ * first block report. This caused the first block report to be
+ * incorrectly ignored.
+ * - The SBN would become stuck in safemode.
+ */
+ @Test
+ public void testBlocksAddedWhileStandbyIsDown() throws Exception {
+ DFSTestUtil.createFile(fs, new Path("/test"), 3*BLOCK_SIZE, (short) 3, 1L);
+
+ banner("Stopping standby");
+ cluster.shutdownNameNode(1);
+
+ DFSTestUtil.createFile(fs, new Path("/test2"), 3*BLOCK_SIZE, (short) 3, 1L);
+
+ banner("Rolling edit log so standby gets all edits on restart");
+ nn0.getRpcServer().rollEditLog();
+
+ restartStandby();
+ assertSafeMode(nn1, 6, 6);
+ }
+
+ /**
+ * Regression test for HDFS-2804: standby should not populate replication
+ * queues when exiting safe mode.
+ */
+ @Test
+ public void testNoPopulatingReplQueuesWhenExitingSafemode() throws Exception {
+ DFSTestUtil.createFile(fs, new Path("/test"), 15*BLOCK_SIZE, (short)3, 1L);
+
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ // get some blocks in the SBN's image
+ nn1.getRpcServer().setSafeMode(SafeModeAction.SAFEMODE_ENTER);
+ NameNodeAdapter.saveNamespace(nn1);
+ nn1.getRpcServer().setSafeMode(SafeModeAction.SAFEMODE_LEAVE);
+
+ // and some blocks in the edit logs
+ DFSTestUtil.createFile(fs, new Path("/test2"), 15*BLOCK_SIZE, (short)3, 1L);
+ nn0.getRpcServer().rollEditLog();
+
+ cluster.stopDataNode(1);
+ cluster.shutdownNameNode(1);
+
+ //Configuration sbConf = cluster.getConfiguration(1);
+ //sbConf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 1);
+ cluster.restartNameNode(1, false);
+ nn1 = cluster.getNameNode(1);
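+ // Wait up to 10 seconds for the restarted standby to leave safe mode.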
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ return !nn1.isInSafeMode();
+ }
+ }, 100, 10000);
+
+ BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
+ assertEquals(0L, nn1.getNamesystem().getUnderReplicatedBlocks());
+ assertEquals(0L, nn1.getNamesystem().getPendingReplicationBlocks());
+ }
+
+ /**
+ * Print a big banner in the test log to make debugging easier.
+ */
+ static void banner(String string) {
+ LOG.info("\n\n\n\n================================================\n" +
+ string + "\n" +
+ "==================================================\n\n");
+ }
+
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAStateTransitions.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAStateTransitions.java
new file mode 100644
index 0000000..092bb5a
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAStateTransitions.java
@@ -0,0 +1,545 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
+import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.token.Token;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread;
+import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
+import org.apache.log4j.Level;
+import org.junit.Assert;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+/**
+ * Tests state transition from active->standby, and manual failover
+ * and failback between two namenodes.
+ */
+public class TestHAStateTransitions {
+ protected static final Log LOG = LogFactory.getLog(
+ TestHAStateTransitions.class);
+ private static final Path TEST_DIR = new Path("/test");
+ private static final Path TEST_FILE_PATH = new Path(TEST_DIR, "foo");
+ private static final String TEST_FILE_STR = TEST_FILE_PATH.toUri().getPath();
+ private static final String TEST_FILE_DATA =
+ "Hello state transitioning world";
+
+ static {
+ ((Log4JLogger)EditLogTailer.LOG).getLogger().setLevel(Level.ALL);
+ }
+
+ /**
+ * Test which takes a single node and flip flops between
+ * active and standby mode, making sure it doesn't
+ * double-play any edits.
+ */
+ @Test
+ public void testTransitionActiveToStandby() throws Exception {
+ Configuration conf = new Configuration();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(1)
+ .build();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ FileSystem fs = cluster.getFileSystem(0);
+
+ fs.mkdirs(TEST_DIR);
+ cluster.transitionToStandby(0);
+ try {
+ fs.mkdirs(new Path("/x"));
+ fail("Didn't throw trying to mutate FS in standby state");
+ } catch (Throwable t) {
+ GenericTestUtils.assertExceptionContains(
+ "Operation category WRITE is not supported", t);
+ }
+ cluster.transitionToActive(0);
+
+ // Create a file, then delete the whole directory recursively.
+ DFSTestUtil.createFile(fs, new Path(TEST_DIR, "foo"),
+ 10, (short)1, 1L);
+ fs.delete(TEST_DIR, true);
+
+ // Now if the standby tries to replay the last segment that it just
+ // wrote as active, it would fail since it's trying to create a file
+ // in a non-existent directory.
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(0);
+
+ assertFalse(fs.exists(TEST_DIR));
+
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Test that transitioning a service to the state that it is already
+ * in is a no-op; specifically, no exception is thrown.
+ */
+ @Test
+ public void testTransitionToCurrentStateIsANop() throws Exception {
+ Configuration conf = new Configuration();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(1)
+ .build();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ cluster.transitionToActive(0);
+ cluster.transitionToStandby(0);
+ cluster.transitionToStandby(0);
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Test manual failover and failback for one namespace
+ * @param cluster single process test cluster
+ * @param conf cluster configuration
+ * @param nsIndex namespace index starting from zero
+ * @throws Exception
+ */
+ private void testManualFailoverFailback(MiniDFSCluster cluster,
+ Configuration conf, int nsIndex) throws Exception {
+ int nn0 = 2 * nsIndex, nn1 = 2 * nsIndex + 1;
+
+ cluster.transitionToActive(nn0);
+
+ LOG.info("Starting with NN 0 active in namespace " + nsIndex);
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+ fs.mkdirs(TEST_DIR);
+
+ LOG.info("Failing over to NN 1 in namespace " + nsIndex);
+ cluster.transitionToStandby(nn0);
+ cluster.transitionToActive(nn1);
+ assertTrue(fs.exists(TEST_DIR));
+ DFSTestUtil.writeFile(fs, TEST_FILE_PATH, TEST_FILE_DATA);
+
+ LOG.info("Failing over to NN 0 in namespace " + nsIndex);
+ cluster.transitionToStandby(nn1);
+ cluster.transitionToActive(nn0);
+ assertTrue(fs.exists(TEST_DIR));
+ assertEquals(TEST_FILE_DATA,
+ DFSTestUtil.readFile(fs, TEST_FILE_PATH));
+
+ LOG.info("Removing test file");
+ fs.delete(TEST_DIR, true);
+ assertFalse(fs.exists(TEST_DIR));
+
+ LOG.info("Failing over to NN 1 in namespace " + nsIndex);
+ cluster.transitionToStandby(nn0);
+ cluster.transitionToActive(nn1);
+ assertFalse(fs.exists(TEST_DIR));
+ }
+
+ /**
+ * Tests manual failover back and forth between two NameNodes.
+ */
+ @Test
+ public void testManualFailoverAndFailback() throws Exception {
+ Configuration conf = new Configuration();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(1)
+ .build();
+ try {
+ cluster.waitActive();
+ // test the only namespace
+ testManualFailoverFailback(cluster, conf, 0);
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Regression test for HDFS-2693: when doing state transitions, we need to
+ * lock the FSNamesystem so that we don't end up doing any writes while it's
+ * "in between" states.
+ * This test case starts up several client threads which do mutation operations
+ * while flipping a NN back and forth from active to standby.
+ */
+ @Test(timeout=120000)
+ public void testTransitionSynchronization() throws Exception {
+ Configuration conf = new Configuration();
+ final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+ try {
+ cluster.waitActive();
+ ReentrantReadWriteLock spyLock = NameNodeAdapter.spyOnFsLock(
+ cluster.getNameNode(0).getNamesystem());
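+ // Delay each writeLock() call with a short random sleep to widen the
+ // race window between client operations and state transitions.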
+ Mockito.doAnswer(new GenericTestUtils.SleepAnswer(50))
+ .when(spyLock).writeLock();
+
+ final FileSystem fs = HATestUtil.configureFailoverFs(
+ cluster, conf);
+
+ TestContext ctx = new TestContext();
+ for (int i = 0; i < 50; i++) {
+ final int finalI = i;
+ ctx.addThread(new RepeatingTestThread(ctx) {
+ @Override
+ public void doAnAction() throws Exception {
+ Path p = new Path("/test-" + finalI);
+ fs.mkdirs(p);
+ fs.delete(p, true);
+ }
+ });
+ }
+
+ ctx.addThread(new RepeatingTestThread(ctx) {
+ @Override
+ public void doAnAction() throws Exception {
+ cluster.transitionToStandby(0);
+ Thread.sleep(50);
+ cluster.transitionToActive(0);
+ }
+ });
+ ctx.startThreads();
+ ctx.waitFor(20000);
+ ctx.stop();
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Test for HDFS-2812. Since lease renewals go from the client
+ * only to the active NN, the SBN will have out-of-date lease
+ * info when it becomes active. We need to make sure we don't
+ * accidentally mark the leases as expired when the failover
+ * proceeds.
+ */
+ @Test(timeout=120000)
+ public void testLeasesRenewedOnTransition() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(1)
+ .build();
+ FSDataOutputStream stm = null;
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+ NameNode nn0 = cluster.getNameNode(0);
+ NameNode nn1 = cluster.getNameNode(1);
+
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+
+ LOG.info("Starting with NN 0 active");
+
+ stm = fs.create(TEST_FILE_PATH);
+ long nn0t0 = NameNodeAdapter.getLeaseRenewalTime(nn0, TEST_FILE_STR);
+ assertTrue(nn0t0 > 0);
+ long nn1t0 = NameNodeAdapter.getLeaseRenewalTime(nn1, TEST_FILE_STR);
+ assertEquals("Lease should not yet exist on nn1",
+ -1, nn1t0);
+
+ Thread.sleep(5); // make sure time advances!
+
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ long nn1t1 = NameNodeAdapter.getLeaseRenewalTime(nn1, TEST_FILE_STR);
+ assertTrue("Lease should have been created on standby. Time was: " +
+ nn1t1, nn1t1 > nn0t0);
+
+ Thread.sleep(5); // make sure time advances!
+
+ LOG.info("Failing over to NN 1");
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+ long nn1t2 = NameNodeAdapter.getLeaseRenewalTime(nn1, TEST_FILE_STR);
+ assertTrue("Lease should have been renewed by failover process",
+ nn1t2 > nn1t1);
+ } finally {
+ IOUtils.closeStream(stm);
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Test that delegation tokens continue to work after the failover.
+ */
+ @Test
+ public void testDelegationTokensAfterFailover() throws IOException,
+ URISyntaxException {
+ Configuration conf = new Configuration();
+ conf.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
+
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ NameNode nn1 = cluster.getNameNode(0);
+ NameNode nn2 = cluster.getNameNode(1);
+
+ String renewer = UserGroupInformation.getLoginUser().getUserName();
+ Token<DelegationTokenIdentifier> token = nn1.getRpcServer()
+ .getDelegationToken(new Text(renewer));
+
+ LOG.info("Failing over to NN 1");
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ nn2.getRpcServer().renewDelegationToken(token);
+ nn2.getRpcServer().cancelDelegationToken(token);
+ token = nn2.getRpcServer().getDelegationToken(new Text(renewer));
+ Assert.assertNotNull(token);
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Tests manual failover back and forth between two NameNodes
+ * for federation cluster with two namespaces.
+ */
+ @Test
+ public void testManualFailoverFailbackFederationHA() throws Exception {
+ Configuration conf = new Configuration();
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHAFederatedTopology(2))
+ .numDataNodes(1)
+ .build();
+ try {
+ cluster.waitActive();
+
+ // test for namespace 0
+ testManualFailoverFailback(cluster, conf, 0);
+
+ // test for namespace 1
+ testManualFailoverFailback(cluster, conf, 1);
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
+ @Test
+ public void testFailoverWithEmptyInProgressEditLog() throws Exception {
+ testFailoverAfterCrashDuringLogRoll(false);
+ }
+
+ @Test
+ public void testFailoverWithEmptyInProgressEditLogWithHeader()
+ throws Exception {
+ testFailoverAfterCrashDuringLogRoll(true);
+ }
+
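+ /**
+ * Roll the edit log on NN0, shut it down, and leave an empty in-progress
+ * edit log segment (optionally with just a header) in the shared edits
+ * directory, then verify that NN1 can still transition to active.
+ */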
+ private static void testFailoverAfterCrashDuringLogRoll(boolean writeHeader)
+ throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, Integer.MAX_VALUE);
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(0)
+ .build();
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+ try {
+ cluster.transitionToActive(0);
+ NameNode nn0 = cluster.getNameNode(0);
+ nn0.getRpcServer().rollEditLog();
+ cluster.shutdownNameNode(0);
+ createEmptyInProgressEditLog(cluster, nn0, writeHeader);
+ cluster.transitionToActive(1);
+ } finally {
+ IOUtils.cleanup(LOG, fs);
+ cluster.shutdown();
+ }
+ }
+
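+ /**
+ * Create an empty in-progress edit log file (optionally writing just the
+ * header) in the shared edits directory, simulating a crash in the middle
+ * of a log roll.
+ */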
+ private static void createEmptyInProgressEditLog(MiniDFSCluster cluster,
+ NameNode nn, boolean writeHeader) throws IOException {
+ long txid = nn.getNamesystem().getEditLog().getLastWrittenTxId();
+ URI sharedEditsUri = cluster.getSharedEditsDir(0, 1);
+ File sharedEditsDir = new File(sharedEditsUri.getPath());
+ StorageDirectory storageDir = new StorageDirectory(sharedEditsDir);
+ File inProgressFile = NameNodeAdapter.getInProgressEditsFile(storageDir,
+ txid + 1);
+ assertTrue("Failed to create in-progress edits file",
+ inProgressFile.createNewFile());
+
+ if (writeHeader) {
+ DataOutputStream out = new DataOutputStream(new FileOutputStream(
+ inProgressFile));
+ EditLogFileOutputStream.writeHeader(out);
+ out.close();
+ }
+ }
+
+
+ /**
+ * The secret manager needs to start/stop - the invariant should be that
+ * the secret manager runs if and only if the NN is active and not in
+ * safe mode. As a state diagram, we need to test all of the following
+ * transitions to make sure the secret manager is started when we transition
+ * into state 4, but none of the others.
+ * <pre>
+ * SafeMode Not SafeMode
+ * Standby 1 <------> 2
+ * ^ ^
+ * | |
+ * v v
+ * Active 3 <------> 4
+ * </pre>
+ */
+ @Test(timeout=60000)
+ public void testSecretManagerState() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setBoolean(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
+ conf.setInt(
+ DFSConfigKeys.DFS_NAMENODE_DELEGATION_KEY_UPDATE_INTERVAL_KEY, 50);
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(1)
+ .waitSafeMode(false)
+ .build();
+ try {
+ cluster.transitionToActive(0);
+ DFSTestUtil.createFile(cluster.getFileSystem(0),
+ TEST_FILE_PATH, 6000, (short)1, 1L);
+
+ cluster.getConfiguration(0).setInt(
+ DFSConfigKeys.DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, 60000);
+
+ cluster.restartNameNode(0);
+ NameNode nn = cluster.getNameNode(0);
+
+ banner("Started in state 1.");
+ assertTrue(nn.isStandbyState());
+ assertTrue(nn.isInSafeMode());
+ assertFalse(isDTRunning(nn));
+
+ banner("Transition 1->2. Should not start secret manager");
+ NameNodeAdapter.leaveSafeMode(nn, false);
+ assertTrue(nn.isStandbyState());
+ assertFalse(nn.isInSafeMode());
+ assertFalse(isDTRunning(nn));
+
+ banner("Transition 2->1. Should not start secret manager.");
+ NameNodeAdapter.enterSafeMode(nn, false);
+ assertTrue(nn.isStandbyState());
+ assertTrue(nn.isInSafeMode());
+ assertFalse(isDTRunning(nn));
+
+ banner("Transition 1->3. Should not start secret manager.");
+ nn.getRpcServer().transitionToActive();
+ assertFalse(nn.isStandbyState());
+ assertTrue(nn.isInSafeMode());
+ assertFalse(isDTRunning(nn));
+
+ banner("Transition 3->1. Should not start secret manager.");
+ nn.getRpcServer().transitionToStandby();
+ assertTrue(nn.isStandbyState());
+ assertTrue(nn.isInSafeMode());
+ assertFalse(isDTRunning(nn));
+
+ banner("Transition 1->3->4. Should start secret manager.");
+ nn.getRpcServer().transitionToActive();
+ NameNodeAdapter.leaveSafeMode(nn, false);
+ assertFalse(nn.isStandbyState());
+ assertFalse(nn.isInSafeMode());
+ assertTrue(isDTRunning(nn));
+
+ banner("Transition 4->3. Should stop secret manager");
+ NameNodeAdapter.enterSafeMode(nn, false);
+ assertFalse(nn.isStandbyState());
+ assertTrue(nn.isInSafeMode());
+ assertFalse(isDTRunning(nn));
+
+ banner("Transition 3->4. Should start secret manager");
+ NameNodeAdapter.leaveSafeMode(nn, false);
+ assertFalse(nn.isStandbyState());
+ assertFalse(nn.isInSafeMode());
+ assertTrue(isDTRunning(nn));
+
+ for (int i = 0; i < 20; i++) {
+ // Loop the last check to suss out races.
+ banner("Transition 4->2. Should stop secret manager.");
+ nn.getRpcServer().transitionToStandby();
+ assertTrue(nn.isStandbyState());
+ assertFalse(nn.isInSafeMode());
+ assertFalse(isDTRunning(nn));
+
+ banner("Transition 2->4. Should start secret manager");
+ nn.getRpcServer().transitionToActive();
+ assertFalse(nn.isStandbyState());
+ assertFalse(nn.isInSafeMode());
+ assertTrue(isDTRunning(nn));
+ }
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
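+ /**
+ * @return true if the NN's delegation token secret manager is running.
+ */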
+ private boolean isDTRunning(NameNode nn) {
+ return NameNodeAdapter.getDtSecretManager(nn.getNamesystem()).isRunning();
+ }
+
+ /**
+ * Print a big banner in the test log to make debugging easier.
+ */
+ static void banner(String string) {
+ LOG.info("\n\n\n\n================================================\n" +
+ string + "\n" +
+ "==================================================\n\n");
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAWebUI.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAWebUI.java
new file mode 100644
index 0000000..be01430
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAWebUI.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.junit.Test;
+
+public class TestHAWebUI {
+
+ /**
+ * Tests that the NameNode web UI provides a link to browse the file
+ * system and a summary of under-replicated blocks only while in the
+ * active state.
+ *
+ */
+ @Test
+ public void testLinkAndClusterSummary() throws Exception {
+ Configuration conf = new Configuration();
+
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology()).numDataNodes(0)
+ .build();
+ try {
+ cluster.waitActive();
+
+ cluster.transitionToActive(0);
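+ // While active, the UI should show the filesystem browser link and
+ // the under-replicated block count.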
+ String pageContents = DFSTestUtil.urlGet(new URL("http://localhost:"
+ + NameNode.getHttpAddress(cluster.getConfiguration(0)).getPort()
+ + "/dfshealth.jsp"));
+ assertTrue(pageContents.contains("Browse the filesystem"));
+ assertTrue(pageContents.contains("Number of Under-Replicated Blocks"));
+
+ cluster.transitionToStandby(0);
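+ // In standby state, neither the browse link nor the block summary
+ // should appear.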
+ pageContents = DFSTestUtil.urlGet(new URL("http://localhost:"
+ + NameNode.getHttpAddress(cluster.getConfiguration(0)).getPort()
+ + "/dfshealth.jsp"));
+ assertFalse(pageContents.contains("Browse the filesystem"));
+ assertFalse(pageContents.contains("Number of Under-Replicated Blocks"));
+
+ cluster.transitionToActive(0);
+ pageContents = DFSTestUtil.urlGet(new URL("http://localhost:"
+ + NameNode.getHttpAddress(cluster.getConfiguration(0)).getPort()
+ + "/dfshealth.jsp"));
+ assertTrue(pageContents.contains("Browse the filesystem"));
+ assertTrue(pageContents.contains("Number of Under-Replicated Blocks"));
+
+ } finally {
+ cluster.shutdown();
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java
new file mode 100644
index 0000000..ab2a8dd
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ha.HealthCheckFailedException;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeResourceChecker;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+public class TestNNHealthCheck {
+
+ @Test
+ public void testNNHealthCheck() throws IOException {
+ MiniDFSCluster cluster = null;
+ try {
+ Configuration conf = new Configuration();
+ cluster = new MiniDFSCluster.Builder(conf)
+ .numDataNodes(0)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .build();
+
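+ // Swap in a mock resource checker so the test can control whether the
+ // NN reports available disk space.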
+ NameNodeResourceChecker mockResourceChecker = Mockito.mock(
+ NameNodeResourceChecker.class);
+ Mockito.doReturn(true).when(mockResourceChecker).hasAvailableDiskSpace();
+ cluster.getNameNode(0).getNamesystem()
+ .setNNResourceChecker(mockResourceChecker);
+
+ NamenodeProtocols rpc = cluster.getNameNodeRpc(0);
+
+ // Should not throw error, which indicates healthy.
+ rpc.monitorHealth();
+
+ Mockito.doReturn(false).when(mockResourceChecker).hasAvailableDiskSpace();
+
+ try {
+ // Should throw error - NN is unhealthy.
+ rpc.monitorHealth();
+ fail("Should not have succeeded in calling monitorHealth");
+ } catch (HealthCheckFailedException hcfe) {
+ GenericTestUtils.assertExceptionContains(
+ "The NameNode has no resources available", hcfe);
+ }
+ } finally {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java
new file mode 100644
index 0000000..547ba72
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestPipelinesFailover.java
@@ -0,0 +1,506 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.security.PrivilegedExceptionAction;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.AppendTestUtil;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
+import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
+import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread;
+import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
+
+import org.apache.log4j.Level;
+
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import com.google.common.base.Supplier;
+
+/**
+ * Test cases regarding pipeline recovery during NN failover.
+ */
+public class TestPipelinesFailover {
+ static {
+ ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)LogFactory.getLog(
+ "org.apache.hadoop.io.retry.RetryInvocationHandler")).getLogger().setLevel(Level.ALL);
+
+ ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
+ }
+
+ protected static final Log LOG = LogFactory.getLog(
+ TestPipelinesFailover.class);
+ private static final Path TEST_PATH =
+ new Path("/test-file");
+ private static final int BLOCK_SIZE = 4096;
+ private static final int BLOCK_AND_A_HALF = BLOCK_SIZE * 3 / 2;
+
+ private static final int STRESS_NUM_THREADS = 25;
+ private static final int STRESS_RUNTIME = 40000;
+
+ /**
+ * Tests continuing a write pipeline over a failover.
+ */
+ @Test(timeout=30000)
+ public void testWriteOverFailover() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
+ // Don't check replication periodically (use a very long interval).
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1000);
+
+ FSDataOutputStream stm = null;
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(3)
+ .build();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ Thread.sleep(500);
+
+ LOG.info("Starting with NN 0 active");
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+ stm = fs.create(TEST_PATH);
+
+ // write a block and a half
+ AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF);
+
+ // Make sure all of the blocks are written out before failover.
+ stm.hflush();
+
+ LOG.info("Failing over to NN 1");
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ assertTrue(fs.exists(TEST_PATH));
+ FSNamesystem ns1 = cluster.getNameNode(1).getNamesystem();
+ BlockManagerTestUtil.updateState(ns1.getBlockManager());
+ assertEquals(0, ns1.getPendingReplicationBlocks());
+ assertEquals(0, ns1.getCorruptReplicaBlocks());
+ assertEquals(0, ns1.getMissingBlocksCount());
+
+ // write another block and a half
+ AppendTestUtil.write(stm, BLOCK_AND_A_HALF, BLOCK_AND_A_HALF);
+
+ stm.close();
+ stm = null;
+
+ AppendTestUtil.check(fs, TEST_PATH, BLOCK_SIZE * 3);
+ } finally {
+ IOUtils.closeStream(stm);
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Tests continuing a write pipeline over a failover when a DN fails
+ * after the failover - ensures that updating the pipeline succeeds
+ * even when the pipeline was constructed on a different NN.
+ */
+ @Test(timeout=30000)
+ public void testWriteOverFailoverWithDnFail() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
+
+ FSDataOutputStream stm = null;
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(5)
+ .build();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ Thread.sleep(500);
+
+ LOG.info("Starting with NN 0 active");
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+ stm = fs.create(TEST_PATH);
+
+ // write a block and a half
+ AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF);
+
+ // Make sure all the blocks are written before failover
+ stm.hflush();
+
+ LOG.info("Failing over to NN 1");
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ assertTrue(fs.exists(TEST_PATH));
+
+ cluster.stopDataNode(0);
+
+ // write another block and a half
+ AppendTestUtil.write(stm, BLOCK_AND_A_HALF, BLOCK_AND_A_HALF);
+ stm.hflush();
+
+ LOG.info("Failing back to NN 0");
+ cluster.transitionToStandby(1);
+ cluster.transitionToActive(0);
+
+ cluster.stopDataNode(1);
+
+ AppendTestUtil.write(stm, BLOCK_AND_A_HALF*2, BLOCK_AND_A_HALF);
+ stm.hflush();
+
+
+ stm.close();
+ stm = null;
+
+ AppendTestUtil.check(fs, TEST_PATH, BLOCK_AND_A_HALF * 3);
+ } finally {
+ IOUtils.closeStream(stm);
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Tests lease recovery if a client crashes. This approximates the
+ * use case of HBase WALs being recovered after a NN failover.
+ */
+ @Test(timeout=30000)
+ public void testLeaseRecoveryAfterFailover() throws Exception {
+ final Configuration conf = new Configuration();
+ // Disable permissions so that another user can recover the lease.
+ conf.setBoolean(DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
+
+ FSDataOutputStream stm = null;
+ final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(3)
+ .build();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ Thread.sleep(500);
+
+ LOG.info("Starting with NN 0 active");
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+ stm = fs.create(TEST_PATH);
+
+ // write a block and a half
+ AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF);
+ stm.hflush();
+
+ LOG.info("Failing over to NN 1");
+
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ assertTrue(fs.exists(TEST_PATH));
+
+ FileSystem fsOtherUser = createFsAsOtherUser(cluster, conf);
+ loopRecoverLease(fsOtherUser, TEST_PATH);
+
+ AppendTestUtil.check(fs, TEST_PATH, BLOCK_AND_A_HALF);
+
+ // Fail back to ensure that the block locations weren't lost on the
+ // original node.
+ cluster.transitionToStandby(1);
+ cluster.transitionToActive(0);
+ AppendTestUtil.check(fs, TEST_PATH, BLOCK_AND_A_HALF);
+ } finally {
+ IOUtils.closeStream(stm);
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Test the scenario where the NN fails over after issuing a block
+ * synchronization request, but before it is committed. The
+ * DN running the recovery should then fail to commit the synchronization
+ * and a later retry will succeed.
+ */
+ @Test(timeout=30000)
+ public void testFailoverRightBeforeCommitSynchronization() throws Exception {
+ final Configuration conf = new Configuration();
+ // Disable permissions so that another user can recover the lease.
+ conf.setBoolean(DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
+
+ FSDataOutputStream stm = null;
+ final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(3)
+ .build();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+ Thread.sleep(500);
+
+ LOG.info("Starting with NN 0 active");
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+ stm = fs.create(TEST_PATH);
+
+ // write a half block
+ AppendTestUtil.write(stm, 0, BLOCK_SIZE / 2);
+ stm.hflush();
+
+ // Look into the block manager on the active node for the block
+ // under construction.
+
+ NameNode nn0 = cluster.getNameNode(0);
+ ExtendedBlock blk = DFSTestUtil.getFirstBlock(fs, TEST_PATH);
+ DatanodeDescriptor expectedPrimary = getExpectedPrimaryNode(nn0, blk);
+ LOG.info("Expecting block recovery to be triggered on DN " +
+ expectedPrimary);
+
+ // Find the corresponding DN daemon, and spy on its connection to the
+ // active.
+ DataNode primaryDN = cluster.getDataNode(expectedPrimary.getIpcPort());
+ DatanodeProtocolClientSideTranslatorPB nnSpy =
+ DataNodeAdapter.spyOnBposToNN(primaryDN, nn0);
+
+ // Delay the commitBlockSynchronization call
+ DelayAnswer delayer = new DelayAnswer(LOG);
+ Mockito.doAnswer(delayer).when(nnSpy).commitBlockSynchronization(
+ Mockito.eq(blk),
+ Mockito.anyInt(), // new genstamp
+ Mockito.anyLong(), // new length
+ Mockito.eq(true), // close file
+ Mockito.eq(false), // delete block
+ (DatanodeID[]) Mockito.anyObject()); // new targets
+
+ DistributedFileSystem fsOtherUser = createFsAsOtherUser(cluster, conf);
+ assertFalse(fsOtherUser.recoverLease(TEST_PATH));
+
+ LOG.info("Waiting for commitBlockSynchronization call from primary");
+ delayer.waitForCall();
+
+ LOG.info("Failing over to NN 1");
+
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+
+ // Let the commitBlockSynchronization call go through, and check that
+ // it failed with the correct exception.
+ delayer.proceed();
+ delayer.waitForResult();
+ Throwable t = delayer.getThrown();
+ if (t == null) {
+ fail("commitBlockSynchronization call did not fail on standby");
+ }
+ GenericTestUtils.assertExceptionContains(
+ "Operation category WRITE is not supported",
+ t);
+
+ // Now, if we try again to recover the block, it should succeed on the new
+ // active.
+ loopRecoverLease(fsOtherUser, TEST_PATH);
+
+ AppendTestUtil.check(fs, TEST_PATH, BLOCK_SIZE/2);
+ } finally {
+ IOUtils.closeStream(stm);
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Stress test for pipeline/lease recovery. Starts a number of
+ * threads, each of which creates a file and has another client
+ * break the lease. While these threads run, failover proceeds
+ * back and forth between two namenodes.
+ */
+ @Test(timeout=STRESS_RUNTIME*3)
+ public void testPipelineRecoveryStress() throws Exception {
+ HAStressTestHarness harness = new HAStressTestHarness();
+ // Disable permissions so that another user can recover the lease.
+ harness.conf.setBoolean(
+ DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
+
+ final MiniDFSCluster cluster = harness.startCluster();
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+
+ FileSystem fs = harness.getFailoverFs();
+ DistributedFileSystem fsAsOtherUser = createFsAsOtherUser(
+ cluster, harness.conf);
+
+ TestContext testers = new TestContext();
+ for (int i = 0; i < STRESS_NUM_THREADS; i++) {
+ Path p = new Path("/test-" + i);
+ testers.addThread(new PipelineTestThread(
+ testers, fs, fsAsOtherUser, p));
+ }
+
+ // Start a separate thread which will make sure that replication
+ // happens quickly by triggering deletion reports and replication
+ // work calculation frequently.
+ harness.addReplicationTriggerThread(500);
+ harness.addFailoverThread(5000);
+ harness.startThreads();
+ testers.startThreads();
+
+ testers.waitFor(STRESS_RUNTIME);
+ testers.stop();
+ harness.stopThreads();
+ } finally {
+ System.err.println("===========================\n\n\n\n");
+ harness.shutdown();
+ }
+ }
+
+ /**
+ * Test thread which creates a file, has another fake user recover
+ * the lease on the file, and then ensures that the file's contents
+ * are properly readable. If any of these steps fails, propagates
+ * an exception back to the test context, causing the test case
+ * to fail.
+ */
+ private static class PipelineTestThread extends RepeatingTestThread {
+ private final FileSystem fs;
+ private final FileSystem fsOtherUser;
+ private final Path path;
+
+
+ public PipelineTestThread(TestContext ctx,
+ FileSystem fs, FileSystem fsOtherUser, Path p) {
+ super(ctx);
+ this.fs = fs;
+ this.fsOtherUser = fsOtherUser;
+ this.path = p;
+ }
+
+ @Override
+ public void doAnAction() throws Exception {
+ FSDataOutputStream stm = fs.create(path, true);
+ try {
+ AppendTestUtil.write(stm, 0, 100);
+ stm.hflush();
+ loopRecoverLease(fsOtherUser, path);
+ AppendTestUtil.check(fs, path, 100);
+ } finally {
+ try {
+ stm.close();
+ } catch (IOException e) {
+ // should expect this since we lost the lease
+ }
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "Pipeline test thread for " + path;
+ }
+ }
+
+
+
+ /**
+ * @return the node which is expected to run the recovery of the
+ * given block, which is known to be under construction inside the
+ * given NameNode.
+ */
+ private DatanodeDescriptor getExpectedPrimaryNode(NameNode nn,
+ ExtendedBlock blk) {
+ BlockManager bm0 = nn.getNamesystem().getBlockManager();
+ BlockInfo storedBlock = bm0.getStoredBlock(blk.getLocalBlock());
+ assertTrue("Block " + blk + " should be under construction, " +
+ "got: " + storedBlock,
+ storedBlock instanceof BlockInfoUnderConstruction);
+ BlockInfoUnderConstruction ucBlock =
+ (BlockInfoUnderConstruction)storedBlock;
+ // We expect that the first indexed replica will be the one
+ // to be in charge of the synchronization / recovery protocol.
+ DatanodeDescriptor expectedPrimary = ucBlock.getExpectedLocations()[0];
+ return expectedPrimary;
+ }
+
+ private DistributedFileSystem createFsAsOtherUser(
+ final MiniDFSCluster cluster, final Configuration conf)
+ throws IOException, InterruptedException {
+ return (DistributedFileSystem) UserGroupInformation.createUserForTesting(
+ "otheruser", new String[] { "othergroup"})
+ .doAs(new PrivilegedExceptionAction<FileSystem>() {
+ @Override
+ public FileSystem run() throws Exception {
+ return HATestUtil.configureFailoverFs(
+ cluster, conf);
+ }
+ });
+ }
+
+ /**
+ * Try to recover the lease on the given file for up to 30
+ * seconds.
+ * @param fsOtherUser the filesystem to use for the recoverLease call
+ * @param testPath the path on which to run lease recovery
+ * @throws TimeoutException if lease recovery does not succeed within 30
+ * seconds
+ * @throws InterruptedException if the thread is interrupted
+ */
+ private static void loopRecoverLease(
+ final FileSystem fsOtherUser, final Path testPath)
+ throws TimeoutException, InterruptedException {
+ try {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+ @Override
+ public Boolean get() {
+ boolean success;
+ try {
+ success = ((DistributedFileSystem)fsOtherUser)
+ .recoverLease(testPath);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ if (!success) {
+ LOG.info("Waiting to recover lease successfully");
+ }
+ return success;
+ }
+ }, 1000, 30000);
+ } catch (TimeoutException e) {
+ throw new TimeoutException("Timed out recovering lease for " +
+ testPath);
+ }
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestQuotasWithHA.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestQuotasWithHA.java
new file mode 100644
index 0000000..5800d3a
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestQuotasWithHA.java
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.io.IOUtils;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestQuotasWithHA {
+ private static final Path TEST_DIR = new Path("/test");
+ private static final Path TEST_FILE = new Path(TEST_DIR, "file");
+ private static final String TEST_DIR_STR = TEST_DIR.toUri().getPath();
+
+ private static final long NS_QUOTA = 10000;
+ private static final long DS_QUOTA = 10000;
+ private static final long BLOCK_SIZE = 1024; // 1KB blocks
+
+ private MiniDFSCluster cluster;
+ private NameNode nn0;
+ private NameNode nn1;
+ private FileSystem fs;
+
+ @Before
+ public void setupCluster() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
+ HAUtil.setAllowStandbyReads(conf, true);
+
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(1)
+ .waitSafeMode(false)
+ .build();
+ cluster.waitActive();
+
+ nn0 = cluster.getNameNode(0);
+ nn1 = cluster.getNameNode(1);
+ fs = HATestUtil.configureFailoverFs(cluster, conf);
+
+ cluster.transitionToActive(0);
+ }
+
+ @After
+ public void shutdownCluster() throws IOException {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Test that quotas are properly tracked by the standby through
+ * create, append, delete.
+ */
+ @Test(timeout=60000)
+ public void testQuotasTrackedOnStandby() throws Exception {
+ fs.mkdirs(TEST_DIR);
+ DistributedFileSystem dfs = (DistributedFileSystem)fs;
+ dfs.setQuota(TEST_DIR, NS_QUOTA, DS_QUOTA);
+ long expectedSize = 3 * BLOCK_SIZE + BLOCK_SIZE/2;
+ DFSTestUtil.createFile(fs, TEST_FILE, expectedSize, (short)1, 1L);
+
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ ContentSummary cs = nn1.getRpcServer().getContentSummary(TEST_DIR_STR);
+ assertEquals(NS_QUOTA, cs.getQuota());
+ assertEquals(DS_QUOTA, cs.getSpaceQuota());
+ assertEquals(expectedSize, cs.getSpaceConsumed());
+ assertEquals(1, cs.getDirectoryCount());
+ assertEquals(1, cs.getFileCount());
+
+ // Append to the file and make sure quota is updated correctly.
+ FSDataOutputStream stm = fs.append(TEST_FILE);
+ try {
+ byte[] data = new byte[(int) (BLOCK_SIZE * 3 / 2)];
+ stm.write(data);
+ expectedSize += data.length;
+ } finally {
+ IOUtils.closeStream(stm);
+ }
+
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ cs = nn1.getRpcServer().getContentSummary(TEST_DIR_STR);
+ assertEquals(NS_QUOTA, cs.getQuota());
+ assertEquals(DS_QUOTA, cs.getSpaceQuota());
+ assertEquals(expectedSize, cs.getSpaceConsumed());
+ assertEquals(1, cs.getDirectoryCount());
+ assertEquals(1, cs.getFileCount());
+
+ fs.delete(TEST_FILE, true);
+ expectedSize = 0;
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ cs = nn1.getRpcServer().getContentSummary(TEST_DIR_STR);
+ assertEquals(NS_QUOTA, cs.getQuota());
+ assertEquals(DS_QUOTA, cs.getSpaceQuota());
+ assertEquals(expectedSize, cs.getSpaceConsumed());
+ assertEquals(1, cs.getDirectoryCount());
+ assertEquals(0, cs.getFileCount());
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java
new file mode 100644
index 0000000..5440c38c
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.FSImage;
+import org.apache.hadoop.hdfs.server.namenode.FSImageTestUtil;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NNStorage;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+
+
+public class TestStandbyCheckpoints {
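+ // Number of mkdir ops written into the aborted edit log used by
+ // testCheckpointCancellation; chosen so that loading it takes a
+ // noticeable amount of time.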
+ private static final int NUM_DIRS_IN_LOG = 200000;
+ private MiniDFSCluster cluster;
+ private NameNode nn0, nn1;
+ private FileSystem fs;
+
+ @Before
+ public void setupCluster() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, 1);
+ conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 5);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+
+ MiniDFSNNTopology topology = new MiniDFSNNTopology()
+ .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
+ .addNN(new MiniDFSNNTopology.NNConf("nn1").setHttpPort(10001))
+ .addNN(new MiniDFSNNTopology.NNConf("nn2").setHttpPort(10002)));
+
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(topology)
+ .numDataNodes(0)
+ .build();
+ cluster.waitActive();
+
+ nn0 = cluster.getNameNode(0);
+ nn1 = cluster.getNameNode(1);
+ fs = HATestUtil.configureFailoverFs(cluster, conf);
+
+ cluster.transitionToActive(0);
+ }
+
+ @After
+ public void shutdownCluster() throws IOException {
+ if (cluster != null) {
+ cluster.shutdown();
+ }
+ }
+
+ @Test
+ public void testSBNCheckpoints() throws Exception {
+ doEdits(0, 10);
+
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ // Once the standby catches up, it should notice that it needs to
+ // do a checkpoint and save one to its local directories.
+ HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(0, 12));
+
+ // It should also upload it back to the active.
+ HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 12));
+ }
+
+ /**
+ * Test for the case when both of the NNs in the cluster are
+ * in the standby state, and thus are both creating checkpoints
+ * and uploading them to each other.
+ * In this circumstance, they should receive the error from the
+ * other node indicating that the other node already has a
+ * checkpoint for the given txid, but this should not cause
+ * an abort, etc.
+ */
+ @Test
+ public void testBothNodesInStandbyState() throws Exception {
+ doEdits(0, 10);
+
+ cluster.transitionToStandby(0);
+
+ // Transitioning to standby closed the edit log on the active,
+ // so the standby will catch up. Then, both will be in standby mode
+ // with enough uncheckpointed txns to cause a checkpoint, and they
+ // will each try to take a checkpoint and upload to each other.
+ HATestUtil.waitForCheckpoint(cluster, 1, ImmutableList.of(0, 12));
+ HATestUtil.waitForCheckpoint(cluster, 0, ImmutableList.of(0, 12));
+
+ assertEquals(12, nn0.getNamesystem().getFSImage()
+ .getMostRecentCheckpointTxId());
+ assertEquals(12, nn1.getNamesystem().getFSImage()
+ .getMostRecentCheckpointTxId());
+
+ List<File> dirs = Lists.newArrayList();
+ dirs.addAll(FSImageTestUtil.getNameNodeCurrentDirs(cluster, 0));
+ dirs.addAll(FSImageTestUtil.getNameNodeCurrentDirs(cluster, 1));
+ FSImageTestUtil.assertParallelFilesAreIdentical(dirs, ImmutableSet.<String>of());
+ }
+
+ /**
+ * Test for the case when the SBN is configured to checkpoint based
+ * on a time period, but no transactions are happening on the
+ * active. Thus, it would want to save a second checkpoint at the
+ * same txid, which is a no-op. This test makes sure this doesn't
+ * cause any problem.
+ */
+ @Test
+ public void testCheckpointWhenNoNewTransactionsHappened()
+ throws Exception {
+ // Checkpoint as fast as we can, in a tight loop.
+ cluster.getConfiguration(1).setInt(
+ DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 0);
+ cluster.restartNameNode(1);
+ nn1 = cluster.getNameNode(1);
+
+ FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1);
+
+ // We shouldn't save any checkpoints at txid=0
+ Thread.sleep(1000);
+ Mockito.verify(spyImage1, Mockito.never())
+ .saveNamespace((FSNamesystem) Mockito.anyObject());
+
+ // Roll the primary and wait for the standby to catch up
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+ Thread.sleep(2000);
+
+ // We should make exactly one checkpoint at this new txid.
+ Mockito.verify(spyImage1, Mockito.times(1))
+ .saveNamespace((FSNamesystem) Mockito.anyObject());
+ }
+
+ /**
+ * Test cancellation of ongoing checkpoints when failover happens
+ * mid-checkpoint.
+ */
+ @Test
+ public void testCheckpointCancellation() throws Exception {
+ cluster.transitionToStandby(0);
+
+ // Create an edit log in the shared edits dir with a lot
+ // of mkdirs operations. This is solely so that the namespace (and
+ // therefore the checkpoint image) is large enough to take a
+ // non-trivial amount of time to load and save. (only ~15MB)
+ URI sharedUri = cluster.getSharedEditsDir(0, 1);
+ File sharedDir = new File(sharedUri.getPath(), "current");
+ File tmpDir = new File(MiniDFSCluster.getBaseDirectory(),
+ "testCheckpointCancellation-tmp");
+ FSImageTestUtil.createAbortedLogWithMkdirs(tmpDir, NUM_DIRS_IN_LOG,
+ 3);
+ String fname = NNStorage.getInProgressEditsFileName(3);
+ new File(tmpDir, fname).renameTo(new File(sharedDir, fname));
+
+ // Checkpoint as fast as we can, in a tight loop.
+ cluster.getConfiguration(1).setInt(
+ DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 0);
+ cluster.restartNameNode(1);
+ nn1 = cluster.getNameNode(1);
+
+ cluster.transitionToActive(0);
+
+ for (int i = 0; i < 10; i++) {
+
+ doEdits(i*10, i*10 + 10);
+ cluster.transitionToStandby(0);
+ cluster.transitionToActive(1);
+ cluster.transitionToStandby(1);
+ cluster.transitionToActive(0);
+ }
+
+ assertTrue(StandbyCheckpointer.getCanceledCount() > 0);
+ }
+
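+ /**
+ * Perform a series of mkdir operations on the active NN so that new
+ * transactions are written to the edit log.
+ */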
+ private void doEdits(int start, int stop) throws IOException {
+ for (int i = start; i < stop; i++) {
+ Path p = new Path("/test" + i);
+ fs.mkdirs(p);
+ }
+ }
+
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyIsHot.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyIsHot.java
new file mode 100644
index 0000000..ce5814b
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyIsHot.java
@@ -0,0 +1,240 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.assertEquals;
+import static org.mockito.Matchers.anyInt;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.impl.Log4JLogger;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
+import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.log4j.Level;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.google.common.base.Supplier;
+
+/**
+ * The hotornot.com of unit tests: makes sure that the standby not only
+ * has namespace information, but also has the correct block reports, etc.
+ */
+public class TestStandbyIsHot {
+ protected static final Log LOG = LogFactory.getLog(
+ TestStandbyIsHot.class);
+ private static final String TEST_FILE_DATA = "hello highly available world";
+ private static final String TEST_FILE = "/testStandbyIsHot";
+ private static final Path TEST_FILE_PATH = new Path(TEST_FILE);
+
+ static {
+ ((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL);
+ ((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
+ }
+
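+ /**
+ * Write a file through the active NN, roll the edit log, and verify
+ * that the standby sees the new block locations and any subsequent
+ * replication changes.
+ */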
+ @Test
+ public void testStandbyIsHot() throws Exception {
+ Configuration conf = new Configuration();
+ // We read from the standby to watch block locations
+ HAUtil.setAllowStandbyReads(conf, true);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(3)
+ .build();
+ Runtime mockRuntime = mock(Runtime.class);
+ try {
+ cluster.waitActive();
+ cluster.transitionToActive(0);
+
+ NameNode nn1 = cluster.getNameNode(0);
+ NameNode nn2 = cluster.getNameNode(1);
+
+ nn2.getNamesystem().getEditLogTailer().setRuntime(mockRuntime);
+
+ FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+
+ Thread.sleep(1000);
+ System.err.println("==================================");
+ DFSTestUtil.writeFile(fs, TEST_FILE_PATH, TEST_FILE_DATA);
+ // Have to force an edit log roll so that the standby catches up
+ nn1.getRpcServer().rollEditLog();
+ System.err.println("==================================");
+
+ // Block locations should show up on standby.
+ LOG.info("Waiting for block locations to appear on standby node");
+ waitForBlockLocations(cluster, nn2, TEST_FILE, 3);
+
+ // Trigger immediate heartbeats and block reports so
+ // that the active "trusts" all of the DNs
+ cluster.triggerHeartbeats();
+ cluster.triggerBlockReports();
+
+ // Change replication
+ LOG.info("Changing replication to 1");
+ fs.setReplication(TEST_FILE_PATH, (short)1);
+ waitForBlockLocations(cluster, nn1, TEST_FILE, 1);
+
+ nn1.getRpcServer().rollEditLog();
+
+ LOG.info("Waiting for lowered replication to show up on standby");
+ waitForBlockLocations(cluster, nn2, TEST_FILE, 1);
+
+ // Change back to 3
+ LOG.info("Changing replication to 3");
+ fs.setReplication(TEST_FILE_PATH, (short)3);
+ nn1.getRpcServer().rollEditLog();
+
+ LOG.info("Waiting for higher replication to show up on standby");
+ waitForBlockLocations(cluster, nn2, TEST_FILE, 3);
+
+ } finally {
+ verify(mockRuntime, times(0)).exit(anyInt());
+ cluster.shutdown();
+ }
+ }
+
+ /**
+ * Regression test for HDFS-2795:
+ * - Start an HA cluster with a DN.
+ * - Write several blocks to the FS with replication 1.
+ * - Shutdown the DN
+ * - Wait for the NNs to declare the DN dead. All blocks will be under-replicated.
+ * - Restart the DN.
+ * In the bug, the standby node would only very slowly notice the blocks returning
+ * to the cluster.
+ */
+ @Test
+ public void testDatanodeRestarts() throws Exception {
+ Configuration conf = new Configuration();
+ conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
+ // We read from the standby to watch block locations
+ HAUtil.setAllowStandbyReads(conf, true);
+ conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+ MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology())
+ .numDataNodes(1)
+ .build();
+ try {
+ NameNode nn0 = cluster.getNameNode(0);
+ NameNode nn1 = cluster.getNameNode(1);
+
+ cluster.transitionToActive(0);
+
+ // Create 5 blocks.
+ DFSTestUtil.createFile(cluster.getFileSystem(0),
+ TEST_FILE_PATH, 5*1024, (short)1, 1L);
+
+ HATestUtil.waitForStandbyToCatchUp(nn0, nn1);
+
+ // Stop the DN.
+ DataNode dn = cluster.getDataNodes().get(0);
+ String dnName = dn.getDatanodeId().getName();
+ DataNodeProperties dnProps = cluster.stopDataNode(0);
+
+ // Make sure both NNs register it as dead.
+ BlockManagerTestUtil.noticeDeadDatanode(nn0, dnName);
+ BlockManagerTestUtil.noticeDeadDatanode(nn1, dnName);
+
+ BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager());
+ BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
+ assertEquals(5, nn0.getNamesystem().getUnderReplicatedBlocks());
+
+ // The SBN will not have any blocks in its neededReplication queue
+ // since the SBN doesn't process replication.
+ assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks());
+
+ LocatedBlocks locs = nn1.getRpcServer().getBlockLocations(
+ TEST_FILE, 0, 1);
+ assertEquals("Standby should have registered that the block has no replicas",
+ 0, locs.get(0).getLocations().length);
+
+ cluster.restartDataNode(dnProps);
+ // Wait for both NNs to re-register the DN.
+ cluster.waitActive(0);
+ cluster.waitActive(1);
+
+ BlockManagerTestUtil.updateState(nn0.getNamesystem().getBlockManager());
+ BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
+ assertEquals(0, nn0.getNamesystem().getUnderReplicatedBlocks());
+ assertEquals(0, nn1.getNamesystem().getUnderReplicatedBlocks());
+
+ locs = nn1.getRpcServer().getBlockLocations(
+ TEST_FILE, 0, 1);
+ assertEquals("Standby should have registered that the block has replicas again",
+ 1, locs.get(0).getLocations().length);
+ } finally {
+ cluster.shutdown();
+ }
+ }
+
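+ /**
+ * Wait up to 10 seconds for the given NN to report exactly
+ * expectedReplicas locations for the last block of the path,
+ * triggering DN deletion reports if too many replicas show up.
+ */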
+ static void waitForBlockLocations(final MiniDFSCluster cluster,
+ final NameNode nn,
+ final String path, final int expectedReplicas)
+ throws Exception {
+ GenericTestUtils.waitFor(new Supplier<Boolean>() {
+
+ @Override
+ public Boolean get() {
+ try {
+ LocatedBlocks locs = NameNodeAdapter.getBlockLocations(nn, path, 0, 1000);
+ DatanodeInfo[] dnis = locs.getLastLocatedBlock().getLocations();
+ for (DatanodeInfo dni : dnis) {
+ Assert.assertNotNull(dni);
+ }
+ int numReplicas = dnis.length;
+
+ LOG.info("Got " + numReplicas + " locs: " + locs);
+ if (numReplicas > expectedReplicas) {
+ for (DataNode dn : cluster.getDataNodes()) {
+ DataNodeAdapter.triggerDeletionReport(dn);
+ }
+ }
+ return numReplicas == expectedReplicas;
+ } catch (IOException e) {
+ LOG.warn("No block locations yet: " + e.getMessage());
+ return false;
+ }
+ }
+ }, 500, 10000);
+
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
index c993f6c..79c7047 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
@@ -163,8 +163,13 @@
// Corrupt first replica of the block
LocatedBlock block = NameNodeAdapter.getBlockLocations(
cluster.getNameNode(), file.toString(), 0, 1).get(0);
- bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
- "TEST");
+ cluster.getNamesystem().writeLock();
+ try {
+ bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
+ "TEST");
+ } finally {
+ cluster.getNamesystem().writeUnlock();
+ }
Thread.sleep(1000); // Wait for block to be marked corrupt
MetricsRecordBuilder rb = getMetrics(NS_METRICS);
assertGauge("CorruptBlocks", 1L, rb);
@@ -202,8 +207,13 @@
// Corrupt the only replica of the block to result in a missing block
LocatedBlock block = NameNodeAdapter.getBlockLocations(
cluster.getNameNode(), file.toString(), 0, 1).get(0);
- bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
- "TEST");
+ cluster.getNamesystem().writeLock();
+ try {
+ bm.findAndMarkBlockAsCorrupt(block.getBlock(), block.getLocations()[0],
+ "TEST");
+ } finally {
+ cluster.getNamesystem().writeUnlock();
+ }
Thread.sleep(1000); // Wait for block to be marked corrupt
MetricsRecordBuilder rb = getMetrics(NS_METRICS);
assertGauge("UnderReplicatedBlocks", 1L, rb);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdmin.java
new file mode 100644
index 0000000..355009a
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdmin.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs.tools;
+
+import static org.junit.Assert.*;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HdfsConfiguration;
+import org.apache.hadoop.ha.HAServiceProtocol;
+import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
+import org.apache.hadoop.ha.HealthCheckFailedException;
+import org.apache.hadoop.ha.NodeFencer;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+import static org.mockito.Mockito.when;
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Joiner;
+
+public class TestDFSHAAdmin {
+ private static final Log LOG = LogFactory.getLog(TestDFSHAAdmin.class);
+
+ private DFSHAAdmin tool;
+ private ByteArrayOutputStream errOutBytes = new ByteArrayOutputStream();
+ private String errOutput;
+ private HAServiceProtocol mockProtocol;
+
+ private static final String NSID = "ns1";
+ private static final String HOST_A = "1.2.3.1";
+ private static final String HOST_B = "1.2.3.2";
+
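+ /**
+ * Build an HA configuration with a single nameservice ("ns1") and two
+ * NameNodes, nn1 at HOST_A and nn2 at HOST_B, both on port 12345.
+ */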
+ private HdfsConfiguration getHAConf() {
+ HdfsConfiguration conf = new HdfsConfiguration();
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICES, NSID);
+ conf.set(DFSConfigKeys.DFS_FEDERATION_NAMESERVICE_ID, NSID);
+ conf.set(DFSUtil.addKeySuffixes(
+ DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX, NSID), "nn1,nn2");
+ conf.set(DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY, "nn1");
+ conf.set(DFSUtil.addKeySuffixes(
+ DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, NSID, "nn1"),
+ HOST_A + ":12345");
+ conf.set(DFSUtil.addKeySuffixes(
+ DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY, NSID, "nn2"),
+ HOST_B + ":12345");
+ return conf;
+ }
+
+ @Before
+ public void setup() throws IOException {
+ mockProtocol = Mockito.mock(HAServiceProtocol.class);
+ when(mockProtocol.readyToBecomeActive()).thenReturn(true);
+ tool = new DFSHAAdmin() {
+ @Override
+ protected HAServiceProtocol getProtocol(String serviceId) throws IOException {
+ getServiceAddr(serviceId);
+ return mockProtocol;
+ }
+ };
+ tool.setConf(getHAConf());
+ tool.setErrOut(new PrintStream(errOutBytes));
+ }
+
+ private void assertOutputContains(String string) {
+ if (!errOutput.contains(string)) {
+ fail("Expected output to contain '" + string + "' but was:\n" +
+ errOutput);
+ }
+ }
+
+ @Test
+ public void testNameserviceOption() throws Exception {
+ assertEquals(-1, runTool("-ns"));
+ assertOutputContains("Missing nameservice ID");
+ assertEquals(-1, runTool("-ns", "ns1"));
+ assertOutputContains("Missing command");
+ // "ns1" isn't defined but we check this lazily and help doesn't use the ns
+ assertEquals(0, runTool("-ns", "ns1", "-help", "transitionToActive"));
+ assertOutputContains("Transitions the service into Active");
+ }
+
+ @Test
+ public void testNamenodeResolution() throws Exception {
+ assertEquals(0, runTool("-getServiceState", "nn1"));
+ Mockito.verify(mockProtocol).getServiceState();
+ assertEquals(-1, runTool("-getServiceState", "undefined"));
+ assertOutputContains(
+ "Unable to determine service address for namenode 'undefined'");
+ }
+
+ @Test
+ public void testHelp() throws Exception {
+ assertEquals(-1, runTool("-help"));
+ assertEquals(0, runTool("-help", "transitionToActive"));
+ assertOutputContains("Transitions the service into Active");
+ }
+
+ @Test
+ public void testTransitionToActive() throws Exception {
+ assertEquals(0, runTool("-transitionToActive", "nn1"));
+ Mockito.verify(mockProtocol).transitionToActive();
+ }
+
+ @Test
+ public void testTransitionToStandby() throws Exception {
+ assertEquals(0, runTool("-transitionToStandby", "nn1"));
+ Mockito.verify(mockProtocol).transitionToStandby();
+ }
+
+ @Test
+ public void testFailoverWithNoFencerConfigured() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ assertEquals(-1, runTool("-failover", "nn1", "nn2"));
+ }
+
+ @Test
+ public void testFailoverWithFencerConfigured() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ HdfsConfiguration conf = getHAConf();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
+ tool.setConf(conf);
+ assertEquals(0, runTool("-failover", "nn1", "nn2"));
+ }
+
+ @Test
+ public void testFailoverWithFencerAndNameservice() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ HdfsConfiguration conf = getHAConf();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
+ tool.setConf(conf);
+ assertEquals(0, runTool("-ns", "ns1", "-failover", "nn1", "nn2"));
+ }
+
+ @Test
+ public void testFailoverWithFencerConfiguredAndForce() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ HdfsConfiguration conf = getHAConf();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
+ tool.setConf(conf);
+ assertEquals(0, runTool("-failover", "nn1", "nn2", "--forcefence"));
+ }
+
+ @Test
+ public void testFailoverWithForceActive() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ HdfsConfiguration conf = getHAConf();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
+ tool.setConf(conf);
+ assertEquals(0, runTool("-failover", "nn1", "nn2", "--forceactive"));
+ }
+
+ @Test
+ public void testFailoverWithInvalidFenceArg() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ HdfsConfiguration conf = getHAConf();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
+ tool.setConf(conf);
+ assertEquals(-1, runTool("-failover", "nn1", "nn2", "notforcefence"));
+ }
+
+ @Test
+ public void testFailoverWithFenceButNoFencer() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ assertEquals(-1, runTool("-failover", "nn1", "nn2", "--forcefence"));
+ }
+
+ @Test
+ public void testFailoverWithFenceAndBadFencer() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ HdfsConfiguration conf = getHAConf();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "foobar!");
+ tool.setConf(conf);
+ assertEquals(-1, runTool("-failover", "nn1", "nn2", "--forcefence"));
+ }
+
+ @Test
+ public void testForceFenceOptionListedBeforeArgs() throws Exception {
+ Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
+ HdfsConfiguration conf = getHAConf();
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
+ tool.setConf(conf);
+ assertEquals(0, runTool("-failover", "--forcefence", "nn1", "nn2"));
+ }
+
+ @Test
+ public void testGetServiceState() throws Exception {
+ assertEquals(0, runTool("-getServiceState", "nn1"));
+ Mockito.verify(mockProtocol).getServiceState();
+ }
+
+ @Test
+ public void testCheckHealth() throws Exception {
+ assertEquals(0, runTool("-checkHealth", "nn1"));
+ Mockito.verify(mockProtocol).monitorHealth();
+
+ Mockito.doThrow(new HealthCheckFailedException("fake health check failure"))
+ .when(mockProtocol).monitorHealth();
+ assertEquals(-1, runTool("-checkHealth", "nn1"));
+ assertOutputContains("Health check failed: fake health check failure");
+ }
+
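+ /**
+ * Run DFSHAAdmin with the given arguments, capturing any error output
+ * into errOutput so that tests can assert on it, and return the exit
+ * code.
+ */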
+ private int runTool(String ... args) throws Exception {
+ errOutBytes.reset();
+ LOG.info("Running: DFSHAAdmin " + Joiner.on(" ").join(args));
+ int ret = tool.run(args);
+ errOutput = new String(errOutBytes.toByteArray(), Charsets.UTF_8);
+ LOG.info("Output:\n" + errOutput);
+ return ret;
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java
new file mode 100644
index 0000000..0302c8e
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.tools;
+
+import static org.junit.Assert.*;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.ha.NodeFencer;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Joiner;
+
+/**
+ * Tests for HAAdmin command with {@link MiniDFSCluster} set up in HA mode.
+ */
+public class TestDFSHAAdminMiniCluster {
+ private static final Log LOG = LogFactory.getLog(TestDFSHAAdminMiniCluster.class);
+
+ private MiniDFSCluster cluster;
+ private Configuration conf;
+ private DFSHAAdmin tool;
+
+ @Before
+ public void setup() throws IOException {
+ conf = new Configuration();
+ cluster = new MiniDFSCluster.Builder(conf)
+ .nnTopology(MiniDFSNNTopology.simpleHATopology()).numDataNodes(0)
+ .build();
+ tool = new DFSHAAdmin();
+ tool.setConf(conf);
+ cluster.waitActive();
+ }
+
+ @After
+ public void shutdown() throws Exception {
+ cluster.shutdown();
+ }
+
+ @Test
+ public void testGetServiceState() throws Exception {
+ assertEquals(0, runTool("-getServiceState", "nn1"));
+ assertEquals(0, runTool("-getServiceState", "nn2"));
+ }
+
+ @Test
+ public void testStateTransition() throws Exception {
+ NameNode nnode1 = cluster.getNameNode(0);
+ assertTrue(nnode1.isStandbyState());
+ assertEquals(0, runTool("-transitionToActive", "nn1"));
+ assertFalse(nnode1.isStandbyState());
+ assertEquals(0, runTool("-transitionToStandby", "nn1"));
+ assertTrue(nnode1.isStandbyState());
+
+ NameNode nnode2 = cluster.getNameNode(1);
+ assertTrue(nnode2.isStandbyState());
+ assertEquals(0, runTool("-transitionToActive", "nn2"));
+ assertFalse(nnode2.isStandbyState());
+ assertEquals(0, runTool("-transitionToStandby", "nn2"));
+ assertTrue(nnode2.isStandbyState());
+ }
+
+ /**
+ * Test failover with various options
+ */
+ @Test
+ public void testFencer() throws Exception {
+ // Test failover with no fencer
+ assertEquals(-1, runTool("-failover", "nn1", "nn2"));
+
+ // Test failover with fencer
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
+ tool.setConf(conf);
+ assertEquals(0, runTool("-transitionToActive", "nn1"));
+ assertEquals(0, runTool("-failover", "nn1", "nn2"));
+
+ // Test failover with fencer and nameservice
+ assertEquals(0, runTool("-ns", "minidfs-ns", "-failover", "nn2", "nn1"));
+
+ // Test failover with fencer and forcefence option
+ assertEquals(0, runTool("-failover", "nn1", "nn2", "--forcefence"));
+
+ // Test failover with forceactive option
+ assertEquals(0, runTool("-failover", "nn2", "nn1", "--forceactive"));
+
+ // Test failover with no fencer and the forcefence option
+ conf.unset(NodeFencer.CONF_METHODS_KEY);
+ tool.setConf(conf);
+ assertEquals(-1, runTool("-failover", "nn1", "nn2", "--forcefence"));
+
+ // Test failover with bad fencer and forcefence option
+ conf.set(NodeFencer.CONF_METHODS_KEY, "foobar!");
+ tool.setConf(conf);
+ assertEquals(-1, runTool("-failover", "nn1", "nn2", "--forcefence"));
+
+ // Test failover with force fence listed before the other arguments
+ conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
+ tool.setConf(conf);
+ assertEquals(0, runTool("-failover", "--forcefence", "nn1", "nn2"));
+ }
+
+ @Test
+ public void testCheckHealth() throws Exception {
+ assertEquals(0, runTool("-checkHealth", "nn1"));
+ assertEquals(0, runTool("-checkHealth", "nn2"));
+ }
+
+ private int runTool(String ... args) throws Exception {
+ ByteArrayOutputStream errOutBytes = new ByteArrayOutputStream();
+ // Capture the tool's error output so it shows up in the test log.
+ tool.setErrOut(new PrintStream(errOutBytes));
+ LOG.info("Running: DFSHAAdmin " + Joiner.on(" ").join(args));
+ int ret = tool.run(args);
+ String errOutput = new String(errOutBytes.toByteArray(), Charsets.UTF_8);
+ LOG.info("Output:\n" + errOutput);
+ return ret;
+ }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java
index 7152e12..97be2b8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestGetConf.java
@@ -24,6 +24,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
+import java.util.Map;
import java.util.StringTokenizer;
import static org.junit.Assert.*;
@@ -32,6 +33,7 @@
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.DFSUtil.ConfiguredNNAddress;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.tools.GetConf;
import org.apache.hadoop.hdfs.tools.GetConf.Command;
@@ -72,7 +74,7 @@
String[] values = new String[nameServiceIdCount];
for (int i = 0; i < nameServiceIdCount; i++, portOffset++) {
String nsID = getNameServiceId(i);
- String specificKey = DFSUtil.getNameServiceIdKey(key, nsID);
+ String specificKey = DFSUtil.addKeySuffixes(key, nsID);
values[i] = "nn" + i + ":" + portOffset;
conf.set(specificKey, values[i]);
}
@@ -80,13 +82,13 @@
}
/*
- * Convert list of InetSocketAddress to string array with each address
- * represented as "host:port"
+ * Convert a list of ConfiguredNNAddress (as returned by the DFSUtil
+ * helpers) to an array of addresses represented as "host:port"
*/
- private String[] toStringArray(List<InetSocketAddress> list) {
+ private String[] toStringArray(List<ConfiguredNNAddress> list) {
String[] ret = new String[list.size()];
for (int i = 0; i < list.size(); i++) {
- ret[i] = NetUtils.getHostPortString(list.get(i));
+ ret[i] = NetUtils.getHostPortString(list.get(i).getAddress());
}
return ret;
}
@@ -94,8 +96,8 @@
/**
* Using DFSUtil methods get the list of given {@code type} of address
*/
- private List<InetSocketAddress> getAddressListFromConf(TestType type,
- HdfsConfiguration conf) throws IOException {
+ private Map<String, Map<String, InetSocketAddress>> getAddressListFromConf(
+ TestType type, HdfsConfiguration conf) throws IOException {
switch (type) {
case NAMENODE:
return DFSUtil.getNNServiceRpcAddresses(conf);
@@ -161,7 +163,7 @@
* @param expected, expected addresses
*/
private void getAddressListFromTool(TestType type, HdfsConfiguration conf,
- boolean checkPort, List<InetSocketAddress> expected) throws Exception {
+ boolean checkPort, List<ConfiguredNNAddress> expected) throws Exception {
String out = getAddressListFromTool(type, conf, expected.size() != 0);
List<String> values = new ArrayList<String>();
@@ -176,7 +178,8 @@
// Convert expected list to String[] of hosts
int i = 0;
String[] expectedHosts = new String[expected.size()];
- for (InetSocketAddress addr : expected) {
+ for (ConfiguredNNAddress cnn : expected) {
+ InetSocketAddress addr = cnn.getAddress();
if (!checkPort) {
expectedHosts[i++] = addr.getHostName();
}else {
@@ -191,7 +194,9 @@
private void verifyAddresses(HdfsConfiguration conf, TestType type,
boolean checkPort, String... expected) throws Exception {
// Ensure DFSUtil returned the right set of addresses
- List<InetSocketAddress> list = getAddressListFromConf(type, conf);
+ Map<String, Map<String, InetSocketAddress>> map =
+ getAddressListFromConf(type, conf);
+ List<ConfiguredNNAddress> list = DFSUtil.flattenAddressMap(map);
String[] actual = toStringArray(list);
Arrays.sort(actual);
Arrays.sort(expected);
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsWithMultipleNameNodes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsWithMultipleNameNodes.java
index 966e52f..5d3272a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsWithMultipleNameNodes.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/web/TestWebHdfsWithMultipleNameNodes.java
@@ -31,6 +31,7 @@
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager;
@@ -79,7 +80,7 @@
conf.setBoolean(DFSConfigKeys.DFS_WEBHDFS_ENABLED_KEY, true);
cluster = new MiniDFSCluster.Builder(conf)
- .numNameNodes(nNameNodes)
+ .nnTopology(MiniDFSNNTopology.simpleFederatedTopology(nNameNodes))
.numDataNodes(nDataNodes)
.build();
cluster.waitActive();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/test/GenericTestUtils.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/test/GenericTestUtils.java
index 13e9683..23d1bb1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/test/GenericTestUtils.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/test/GenericTestUtils.java
@@ -20,6 +20,7 @@
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
+import java.util.Random;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeoutException;
@@ -79,8 +80,8 @@
public static void assertExceptionContains(String string, Throwable t) {
String msg = t.getMessage();
Assert.assertTrue(
- "Unexpected exception:" + StringUtils.stringifyException(t),
- msg.contains(string));
+ "Expected to find '" + string + "' but got unexpected exception:"
+ + StringUtils.stringifyException(t), msg.contains(string));
}
public static void waitFor(Supplier<Boolean> check,
@@ -109,7 +110,11 @@
private final CountDownLatch fireLatch = new CountDownLatch(1);
private final CountDownLatch waitLatch = new CountDownLatch(1);
-
+ private final CountDownLatch resultLatch = new CountDownLatch(1);
+
+ // Result fields set after proceed() is called.
+ private volatile Throwable thrown;
+ private volatile Object returnValue;
public DelayAnswer(Log log) {
this.LOG = log;
@@ -144,7 +149,40 @@
}
protected Object passThrough(InvocationOnMock invocation) throws Throwable {
- return invocation.callRealMethod();
+ try {
+ Object ret = invocation.callRealMethod();
+ returnValue = ret;
+ return ret;
+ } catch (Throwable t) {
+ thrown = t;
+ throw t;
+ } finally {
+ resultLatch.countDown();
+ }
+ }
+
+ /**
+ * After calling proceed(), this will wait until the call has
+ * completed and a result has been returned to the caller.
+ */
+ public void waitForResult() throws InterruptedException {
+ resultLatch.await();
+ }
+
+ /**
+ * After the call has gone through, return any exception that
+ * was thrown, or null if no exception was thrown.
+ */
+ public Throwable getThrown() {
+ return thrown;
+ }
+
+ /**
+ * After the call has gone through, return the call's return value,
+ * or null in case it was void or an exception was thrown.
+ */
+ public Object getReturnValue() {
+ return returnValue;
}
}
@@ -176,4 +214,35 @@
}
}
+ /**
+ * An Answer implementation which sleeps for a random number of milliseconds
+ * between 0 and a configurable value before delegating to the real
+ * implementation of the method. This can be useful for drawing out race
+ * conditions.
+ */
+ public static class SleepAnswer implements Answer<Object> {
+ private final int maxSleepTime;
+ private static Random r = new Random();
+
+ public SleepAnswer(int maxSleepTime) {
+ this.maxSleepTime = maxSleepTime;
+ }
+
+ @Override
+ public Object answer(InvocationOnMock invocation) throws Throwable {
+ boolean interrupted = false;
+ try {
+ Thread.sleep(r.nextInt(maxSleepTime));
+ } catch (InterruptedException ie) {
+ interrupted = true;
+ }
+ try {
+ return invocation.callRealMethod();
+ } finally {
+ if (interrupted) {
+ Thread.currentThread().interrupt();
+ }
+ }
+ }
+ }
}
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored
index 0101672..5099ce2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored
Binary files differ
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored.xml b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored.xml
index 65fe23a..acc34bb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/editsStored.xml
@@ -1,34 +1,34 @@
<?xml version="1.0"?>
<EDITS>
- <EDITS_VERSION>-38</EDITS_VERSION>
+ <EDITS_VERSION>-40</EDITS_VERSION>
<RECORD>
<OPCODE>24</OPCODE>
<DATA>
<TRANSACTION_ID>1</TRANSACTION_ID>
</DATA>
- <CHECKSUM>1504643968</CHECKSUM>
+ <CHECKSUM>-2045328303</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>21</OPCODE>
<DATA>
<TRANSACTION_ID>2</TRANSACTION_ID>
<KEY_ID>1</KEY_ID>
- <KEY_EXPIRY_DATE>1304751257518</KEY_EXPIRY_DATE>
+ <KEY_EXPIRY_DATE>1331096884634</KEY_EXPIRY_DATE>
<KEY_LENGTH>3</KEY_LENGTH>
- <KEY_BLOB>2FhO</KEY_BLOB>
+ <KEY_BLOB>o0v1</KEY_BLOB>
</DATA>
- <CHECKSUM>-174778556</CHECKSUM>
+ <CHECKSUM>-1521490291</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>21</OPCODE>
<DATA>
<TRANSACTION_ID>3</TRANSACTION_ID>
<KEY_ID>2</KEY_ID>
- <KEY_EXPIRY_DATE>1304751257521</KEY_EXPIRY_DATE>
+ <KEY_EXPIRY_DATE>1331096884637</KEY_EXPIRY_DATE>
<KEY_LENGTH>3</KEY_LENGTH>
- <KEY_BLOB>77-r</KEY_BLOB>
+ <KEY_BLOB>3WMF</KEY_BLOB>
</DATA>
- <CHECKSUM>1565957291</CHECKSUM>
+ <CHECKSUM>65546244</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>10</OPCODE>
@@ -42,11 +42,10 @@
<OPCODE>0</OPCODE>
<DATA>
<TRANSACTION_ID>5</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
<PATH>/file_create</PATH>
<REPLICATION>1</REPLICATION>
- <MTIME>1304060057562</MTIME>
- <ATIME>1304060057562</ATIME>
+ <MTIME>1330405685834</MTIME>
+ <ATIME>1330405685834</ATIME>
<BLOCKSIZE>512</BLOCKSIZE>
<NUMBLOCKS>0</NUMBLOCKS>
<PERMISSION_STATUS>
@@ -54,20 +53,19 @@
<GROUPNAME>supergroup</GROUPNAME>
<FS_PERMISSIONS>420</FS_PERMISSIONS>
</PERMISSION_STATUS>
- <CLIENT_NAME>DFSClient_NONMAPREDUCE_-66857152_1</CLIENT_NAME>
+ <CLIENT_NAME>DFSClient_NONMAPREDUCE_-2143415023_1</CLIENT_NAME>
<CLIENT_MACHINE>127.0.0.1</CLIENT_MACHINE>
</DATA>
- <CHECKSUM>-1854451489</CHECKSUM>
+ <CHECKSUM>179250704</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>9</OPCODE>
<DATA>
<TRANSACTION_ID>6</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
<PATH>/file_create</PATH>
<REPLICATION>1</REPLICATION>
- <MTIME>1304060057572</MTIME>
- <ATIME>1304060057562</ATIME>
+ <MTIME>1330405685848</MTIME>
+ <ATIME>1330405685834</ATIME>
<BLOCKSIZE>512</BLOCKSIZE>
<NUMBLOCKS>0</NUMBLOCKS>
<PERMISSION_STATUS>
@@ -76,44 +74,41 @@
<FS_PERMISSIONS>420</FS_PERMISSIONS>
</PERMISSION_STATUS>
</DATA>
- <CHECKSUM>617592855</CHECKSUM>
+ <CHECKSUM>-584136658</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>1</OPCODE>
<DATA>
<TRANSACTION_ID>7</TRANSACTION_ID>
- <LENGTH>3</LENGTH>
<SOURCE>/file_create</SOURCE>
<DESTINATION>/file_moved</DESTINATION>
- <TIMESTAMP>1304060057575</TIMESTAMP>
+ <TIMESTAMP>1330405685852</TIMESTAMP>
</DATA>
- <CHECKSUM>367100554</CHECKSUM>
+ <CHECKSUM>-1983534581</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>2</OPCODE>
<DATA>
<TRANSACTION_ID>8</TRANSACTION_ID>
- <LENGTH>2</LENGTH>
<PATH>/file_moved</PATH>
- <TIMESTAMP>1304060057577</TIMESTAMP>
+ <TIMESTAMP>1330405685857</TIMESTAMP>
</DATA>
- <CHECKSUM>1048346698</CHECKSUM>
+ <CHECKSUM>-97648053</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>3</OPCODE>
<DATA>
<TRANSACTION_ID>9</TRANSACTION_ID>
- <LENGTH>3</LENGTH>
<PATH>/directory_mkdir</PATH>
- <TIMESTAMP>1304060057581</TIMESTAMP>
- <ATIME>0</ATIME>
+ <TIMESTAMP>1330405685861</TIMESTAMP>
+ <ATIME>1330405685861</ATIME>
<PERMISSION_STATUS>
<USERNAME>todd</USERNAME>
<GROUPNAME>supergroup</GROUPNAME>
<FS_PERMISSIONS>493</FS_PERMISSIONS>
</PERMISSION_STATUS>
</DATA>
- <CHECKSUM>1207240248</CHECKSUM>
+ <CHECKSUM>-146811985</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>10</OPCODE>
@@ -127,11 +122,10 @@
<OPCODE>0</OPCODE>
<DATA>
<TRANSACTION_ID>11</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
<PATH>/file_create</PATH>
<REPLICATION>1</REPLICATION>
- <MTIME>1304060057584</MTIME>
- <ATIME>1304060057584</ATIME>
+ <MTIME>1330405685866</MTIME>
+ <ATIME>1330405685866</ATIME>
<BLOCKSIZE>512</BLOCKSIZE>
<NUMBLOCKS>0</NUMBLOCKS>
<PERMISSION_STATUS>
@@ -139,20 +133,19 @@
<GROUPNAME>supergroup</GROUPNAME>
<FS_PERMISSIONS>420</FS_PERMISSIONS>
</PERMISSION_STATUS>
- <CLIENT_NAME>DFSClient_NONMAPREDUCE_-66857152_1</CLIENT_NAME>
+ <CLIENT_NAME>DFSClient_NONMAPREDUCE_-2143415023_1</CLIENT_NAME>
<CLIENT_MACHINE>127.0.0.1</CLIENT_MACHINE>
</DATA>
- <CHECKSUM>1796314473</CHECKSUM>
+ <CHECKSUM>806955943</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>9</OPCODE>
<DATA>
<TRANSACTION_ID>12</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
<PATH>/file_create</PATH>
<REPLICATION>1</REPLICATION>
- <MTIME>1304060057588</MTIME>
- <ATIME>1304060057584</ATIME>
+ <MTIME>1330405685868</MTIME>
+ <ATIME>1330405685866</ATIME>
<BLOCKSIZE>512</BLOCKSIZE>
<NUMBLOCKS>0</NUMBLOCKS>
<PERMISSION_STATUS>
@@ -161,7 +154,7 @@
<FS_PERMISSIONS>420</FS_PERMISSIONS>
</PERMISSION_STATUS>
</DATA>
- <CHECKSUM>1017626905</CHECKSUM>
+ <CHECKSUM>641893387</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>4</OPCODE>
@@ -170,7 +163,7 @@
<PATH>/file_create</PATH>
<REPLICATION>1</REPLICATION>
</DATA>
- <CHECKSUM>1842610087</CHECKSUM>
+ <CHECKSUM>24198146</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>7</OPCODE>
@@ -195,12 +188,11 @@
<OPCODE>13</OPCODE>
<DATA>
<TRANSACTION_ID>16</TRANSACTION_ID>
- <LENGTH>3</LENGTH>
<PATH>/file_create</PATH>
<MTIME>1285195527000</MTIME>
<ATIME>1285195527000</ATIME>
</DATA>
- <CHECKSUM>1428793678</CHECKSUM>
+ <CHECKSUM>1853168961</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>14</OPCODE>
@@ -216,13 +208,12 @@
<OPCODE>15</OPCODE>
<DATA>
<TRANSACTION_ID>18</TRANSACTION_ID>
- <LENGTH>3</LENGTH>
<SOURCE>/file_create</SOURCE>
<DESTINATION>/file_moved</DESTINATION>
- <TIMESTAMP>1304060057605</TIMESTAMP>
+ <TIMESTAMP>1330405685882</TIMESTAMP>
<RENAME_OPTIONS>AA</RENAME_OPTIONS>
</DATA>
- <CHECKSUM>-1155144192</CHECKSUM>
+ <CHECKSUM>-1235158297</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>10</OPCODE>
@@ -236,11 +227,10 @@
<OPCODE>0</OPCODE>
<DATA>
<TRANSACTION_ID>20</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
<PATH>/file_concat_target</PATH>
<REPLICATION>1</REPLICATION>
- <MTIME>1304060057613</MTIME>
- <ATIME>1304060057613</ATIME>
+ <MTIME>1330405685889</MTIME>
+ <ATIME>1330405685889</ATIME>
<BLOCKSIZE>512</BLOCKSIZE>
<NUMBLOCKS>0</NUMBLOCKS>
<PERMISSION_STATUS>
@@ -248,125 +238,141 @@
<GROUPNAME>supergroup</GROUPNAME>
<FS_PERMISSIONS>420</FS_PERMISSIONS>
</PERMISSION_STATUS>
- <CLIENT_NAME>DFSClient_NONMAPREDUCE_-66857152_1</CLIENT_NAME>
+ <CLIENT_NAME>DFSClient_NONMAPREDUCE_-2143415023_1</CLIENT_NAME>
<CLIENT_MACHINE>127.0.0.1</CLIENT_MACHINE>
</DATA>
- <CHECKSUM>-428545606</CHECKSUM>
- </RECORD>
- <RECORD>
- <OPCODE>9</OPCODE>
- <DATA>
- <TRANSACTION_ID>21</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
- <PATH>/file_concat_target</PATH>
- <REPLICATION>1</REPLICATION>
- <MTIME>1304060057694</MTIME>
- <ATIME>1304060057613</ATIME>
- <BLOCKSIZE>512</BLOCKSIZE>
- <NUMBLOCKS>3</NUMBLOCKS>
- <BLOCK>
- <BLOCK_ID>3459038074990663911</BLOCK_ID>
- <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
- <BLOCK_GENERATION_STAMP>1003</BLOCK_GENERATION_STAMP>
- </BLOCK>
- <BLOCK>
- <BLOCK_ID>-5555244278278879146</BLOCK_ID>
- <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
- <BLOCK_GENERATION_STAMP>1003</BLOCK_GENERATION_STAMP>
- </BLOCK>
- <BLOCK>
- <BLOCK_ID>-6344128791846831740</BLOCK_ID>
- <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
- <BLOCK_GENERATION_STAMP>1003</BLOCK_GENERATION_STAMP>
- </BLOCK>
- <PERMISSION_STATUS>
- <USERNAME>todd</USERNAME>
- <GROUPNAME>supergroup</GROUPNAME>
- <FS_PERMISSIONS>420</FS_PERMISSIONS>
- </PERMISSION_STATUS>
- </DATA>
- <CHECKSUM>707995174</CHECKSUM>
+ <CHECKSUM>-981119572</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>10</OPCODE>
<DATA>
- <TRANSACTION_ID>22</TRANSACTION_ID>
+ <TRANSACTION_ID>21</TRANSACTION_ID>
<GENERATION_STAMP>1004</GENERATION_STAMP>
</DATA>
- <CHECKSUM>-1500977009</CHECKSUM>
+ <CHECKSUM>-1627007926</CHECKSUM>
</RECORD>
<RECORD>
- <OPCODE>0</OPCODE>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>22</TRANSACTION_ID>
+ <PATH>/file_concat_target</PATH>
+ <NUMBLOCKS>1</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-7144805496741076283</BLOCK_ID>
+ <BLOCK_NUM_BYTES>0</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1004</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>-1131701615</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
<DATA>
<TRANSACTION_ID>23</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
- <PATH>/file_concat_0</PATH>
- <REPLICATION>1</REPLICATION>
- <MTIME>1304060057701</MTIME>
- <ATIME>1304060057701</ATIME>
- <BLOCKSIZE>512</BLOCKSIZE>
- <NUMBLOCKS>0</NUMBLOCKS>
- <PERMISSION_STATUS>
- <USERNAME>todd</USERNAME>
- <GROUPNAME>supergroup</GROUPNAME>
- <FS_PERMISSIONS>420</FS_PERMISSIONS>
- </PERMISSION_STATUS>
- <CLIENT_NAME>DFSClient_NONMAPREDUCE_-66857152_1</CLIENT_NAME>
- <CLIENT_MACHINE>127.0.0.1</CLIENT_MACHINE>
+ <GENERATION_STAMP>1005</GENERATION_STAMP>
</DATA>
- <CHECKSUM>-119850856</CHECKSUM>
+ <CHECKSUM>-957035430</CHECKSUM>
</RECORD>
<RECORD>
- <OPCODE>9</OPCODE>
+ <OPCODE>25</OPCODE>
<DATA>
<TRANSACTION_ID>24</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
- <PATH>/file_concat_0</PATH>
- <REPLICATION>1</REPLICATION>
- <MTIME>1304060057737</MTIME>
- <ATIME>1304060057701</ATIME>
- <BLOCKSIZE>512</BLOCKSIZE>
- <NUMBLOCKS>3</NUMBLOCKS>
+ <PATH>/file_concat_target</PATH>
+ <NUMBLOCKS>2</NUMBLOCKS>
<BLOCK>
- <BLOCK_ID>4671949296381030428</BLOCK_ID>
+ <BLOCK_ID>-7144805496741076283</BLOCK_ID>
<BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
<BLOCK_GENERATION_STAMP>1004</BLOCK_GENERATION_STAMP>
</BLOCK>
<BLOCK>
- <BLOCK_ID>-844362243522407159</BLOCK_ID>
- <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
- <BLOCK_GENERATION_STAMP>1004</BLOCK_GENERATION_STAMP>
+ <BLOCK_ID>-4125931756867080767</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>-512</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
</BLOCK>
- <BLOCK>
- <BLOCK_ID>3476886462779656950</BLOCK_ID>
- <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
- <BLOCK_GENERATION_STAMP>1004</BLOCK_GENERATION_STAMP>
- </BLOCK>
- <PERMISSION_STATUS>
- <USERNAME>todd</USERNAME>
- <GROUPNAME>supergroup</GROUPNAME>
- <FS_PERMISSIONS>420</FS_PERMISSIONS>
- </PERMISSION_STATUS>
</DATA>
- <CHECKSUM>-766805874</CHECKSUM>
+ <CHECKSUM>-932985519</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>10</OPCODE>
<DATA>
<TRANSACTION_ID>25</TRANSACTION_ID>
- <GENERATION_STAMP>1005</GENERATION_STAMP>
+ <GENERATION_STAMP>1006</GENERATION_STAMP>
</DATA>
- <CHECKSUM>238426056</CHECKSUM>
+ <CHECKSUM>-1757460878</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>26</TRANSACTION_ID>
+ <PATH>/file_concat_target</PATH>
+ <NUMBLOCKS>3</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-7144805496741076283</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1004</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>-4125931756867080767</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>0</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>1562413691487277050</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>-512</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>-154090859</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>9</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>27</TRANSACTION_ID>
+ <PATH>/file_concat_target</PATH>
+ <REPLICATION>1</REPLICATION>
+ <MTIME>1330405685978</MTIME>
+ <ATIME>1330405685889</ATIME>
+ <BLOCKSIZE>512</BLOCKSIZE>
+ <NUMBLOCKS>3</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-7144805496741076283</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1004</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>-4125931756867080767</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1005</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>1562413691487277050</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1006</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <PERMISSION_STATUS>
+ <USERNAME>todd</USERNAME>
+ <GROUPNAME>supergroup</GROUPNAME>
+ <FS_PERMISSIONS>420</FS_PERMISSIONS>
+ </PERMISSION_STATUS>
+ </DATA>
+ <CHECKSUM>-292633850</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>28</TRANSACTION_ID>
+ <GENERATION_STAMP>1007</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>-1431358549</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>0</OPCODE>
<DATA>
- <TRANSACTION_ID>26</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
- <PATH>/file_concat_1</PATH>
+ <TRANSACTION_ID>29</TRANSACTION_ID>
+ <PATH>/file_concat_0</PATH>
<REPLICATION>1</REPLICATION>
- <MTIME>1304060057742</MTIME>
- <ATIME>1304060057742</ATIME>
+ <MTIME>1330405685983</MTIME>
+ <ATIME>1330405685983</ATIME>
<BLOCKSIZE>512</BLOCKSIZE>
<NUMBLOCKS>0</NUMBLOCKS>
<PERMISSION_STATUS>
@@ -374,36 +380,116 @@
<GROUPNAME>supergroup</GROUPNAME>
<FS_PERMISSIONS>420</FS_PERMISSIONS>
</PERMISSION_STATUS>
- <CLIENT_NAME>DFSClient_NONMAPREDUCE_-66857152_1</CLIENT_NAME>
+ <CLIENT_NAME>DFSClient_NONMAPREDUCE_-2143415023_1</CLIENT_NAME>
<CLIENT_MACHINE>127.0.0.1</CLIENT_MACHINE>
</DATA>
- <CHECKSUM>1156254705</CHECKSUM>
+ <CHECKSUM>-318194869</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>30</TRANSACTION_ID>
+ <GENERATION_STAMP>1008</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>156309208</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>31</TRANSACTION_ID>
+ <PATH>/file_concat_0</PATH>
+ <NUMBLOCKS>1</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>6084289468290363112</BLOCK_ID>
+ <BLOCK_NUM_BYTES>0</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1008</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>-596016492</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>32</TRANSACTION_ID>
+ <GENERATION_STAMP>1009</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>-1734001394</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>33</TRANSACTION_ID>
+ <PATH>/file_concat_0</PATH>
+ <NUMBLOCKS>2</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>6084289468290363112</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1008</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>-4219431127125026105</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>-512</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>1352178323</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>34</TRANSACTION_ID>
+ <GENERATION_STAMP>1010</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>794444850</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>35</TRANSACTION_ID>
+ <PATH>/file_concat_0</PATH>
+ <NUMBLOCKS>3</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>6084289468290363112</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1008</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>-4219431127125026105</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>0</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>-1765119074945211374</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>-512</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>-1530696539</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>9</OPCODE>
<DATA>
- <TRANSACTION_ID>27</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
- <PATH>/file_concat_1</PATH>
+ <TRANSACTION_ID>36</TRANSACTION_ID>
+ <PATH>/file_concat_0</PATH>
<REPLICATION>1</REPLICATION>
- <MTIME>1304060057764</MTIME>
- <ATIME>1304060057742</ATIME>
+ <MTIME>1330405686013</MTIME>
+ <ATIME>1330405685983</ATIME>
<BLOCKSIZE>512</BLOCKSIZE>
<NUMBLOCKS>3</NUMBLOCKS>
<BLOCK>
- <BLOCK_ID>-754893470864399741</BLOCK_ID>
+ <BLOCK_ID>6084289468290363112</BLOCK_ID>
<BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
- <BLOCK_GENERATION_STAMP>1005</BLOCK_GENERATION_STAMP>
+ <BLOCK_GENERATION_STAMP>1008</BLOCK_GENERATION_STAMP>
</BLOCK>
<BLOCK>
- <BLOCK_ID>1820875380010181049</BLOCK_ID>
+ <BLOCK_ID>-4219431127125026105</BLOCK_ID>
<BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
- <BLOCK_GENERATION_STAMP>1005</BLOCK_GENERATION_STAMP>
+ <BLOCK_GENERATION_STAMP>1009</BLOCK_GENERATION_STAMP>
</BLOCK>
<BLOCK>
- <BLOCK_ID>8266387560744259971</BLOCK_ID>
+ <BLOCK_ID>-1765119074945211374</BLOCK_ID>
<BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
- <BLOCK_GENERATION_STAMP>1005</BLOCK_GENERATION_STAMP>
+ <BLOCK_GENERATION_STAMP>1010</BLOCK_GENERATION_STAMP>
</BLOCK>
<PERMISSION_STATUS>
<USERNAME>todd</USERNAME>
@@ -411,121 +497,336 @@
<FS_PERMISSIONS>420</FS_PERMISSIONS>
</PERMISSION_STATUS>
</DATA>
- <CHECKSUM>-654780301</CHECKSUM>
+ <CHECKSUM>-2043978220</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>37</TRANSACTION_ID>
+ <GENERATION_STAMP>1011</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>1010571629</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>0</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>38</TRANSACTION_ID>
+ <PATH>/file_concat_1</PATH>
+ <REPLICATION>1</REPLICATION>
+ <MTIME>1330405686017</MTIME>
+ <ATIME>1330405686017</ATIME>
+ <BLOCKSIZE>512</BLOCKSIZE>
+ <NUMBLOCKS>0</NUMBLOCKS>
+ <PERMISSION_STATUS>
+ <USERNAME>todd</USERNAME>
+ <GROUPNAME>supergroup</GROUPNAME>
+ <FS_PERMISSIONS>420</FS_PERMISSIONS>
+ </PERMISSION_STATUS>
+ <CLIENT_NAME>DFSClient_NONMAPREDUCE_-2143415023_1</CLIENT_NAME>
+ <CLIENT_MACHINE>127.0.0.1</CLIENT_MACHINE>
+ </DATA>
+ <CHECKSUM>-501297097</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>39</TRANSACTION_ID>
+ <GENERATION_STAMP>1012</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>-1934711736</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>40</TRANSACTION_ID>
+ <PATH>/file_concat_1</PATH>
+ <NUMBLOCKS>1</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-7448471719302683860</BLOCK_ID>
+ <BLOCK_NUM_BYTES>0</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1012</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>-1853122907</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>41</TRANSACTION_ID>
+ <GENERATION_STAMP>1013</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>862670668</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>42</TRANSACTION_ID>
+ <PATH>/file_concat_1</PATH>
+ <NUMBLOCKS>2</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-7448471719302683860</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1012</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>-8051065559769974521</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>-512</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>-1169706939</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>43</TRANSACTION_ID>
+ <GENERATION_STAMP>1014</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>-2070661520</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>44</TRANSACTION_ID>
+ <PATH>/file_concat_1</PATH>
+ <NUMBLOCKS>3</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-7448471719302683860</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1012</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>-8051065559769974521</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>0</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>3808670437711973616</BLOCK_ID>
+ <BLOCK_DELTA_NUM_BYTES>-512</BLOCK_DELTA_NUM_BYTES>
+ <BLOCK_DELTA_GEN_STAMP>1</BLOCK_DELTA_GEN_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>-1568093815</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>9</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>45</TRANSACTION_ID>
+ <PATH>/file_concat_1</PATH>
+ <REPLICATION>1</REPLICATION>
+ <MTIME>1330405686042</MTIME>
+ <ATIME>1330405686017</ATIME>
+ <BLOCKSIZE>512</BLOCKSIZE>
+ <NUMBLOCKS>3</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-7448471719302683860</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1012</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>-8051065559769974521</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1013</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <BLOCK>
+ <BLOCK_ID>3808670437711973616</BLOCK_ID>
+ <BLOCK_NUM_BYTES>512</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1014</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <PERMISSION_STATUS>
+ <USERNAME>todd</USERNAME>
+ <GROUPNAME>supergroup</GROUPNAME>
+ <FS_PERMISSIONS>420</FS_PERMISSIONS>
+ </PERMISSION_STATUS>
+ </DATA>
+ <CHECKSUM>-1640101896</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>16</OPCODE>
<DATA>
- <TRANSACTION_ID>28</TRANSACTION_ID>
- <LENGTH>4</LENGTH>
+ <TRANSACTION_ID>46</TRANSACTION_ID>
<CONCAT_TARGET>/file_concat_target</CONCAT_TARGET>
+ <LENGTH>2</LENGTH>
<CONCAT_SOURCE>/file_concat_0</CONCAT_SOURCE>
<CONCAT_SOURCE>/file_concat_1</CONCAT_SOURCE>
- <TIMESTAMP>1304060057767</TIMESTAMP>
+ <TIMESTAMP>1330405686046</TIMESTAMP>
</DATA>
- <CHECKSUM>1273279541</CHECKSUM>
+ <CHECKSUM>2122891157</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>17</OPCODE>
<DATA>
- <TRANSACTION_ID>29</TRANSACTION_ID>
- <LENGTH>4</LENGTH>
+ <TRANSACTION_ID>47</TRANSACTION_ID>
<SOURCE>/file_symlink</SOURCE>
<DESTINATION>/file_concat_target</DESTINATION>
- <MTIME>1304060057770</MTIME>
- <ATIME>1304060057770</ATIME>
+ <MTIME>1330405686051</MTIME>
+ <ATIME>1330405686051</ATIME>
<PERMISSION_STATUS>
<USERNAME>todd</USERNAME>
<GROUPNAME>supergroup</GROUPNAME>
<FS_PERMISSIONS>511</FS_PERMISSIONS>
</PERMISSION_STATUS>
</DATA>
- <CHECKSUM>1385678569</CHECKSUM>
+ <CHECKSUM>-585385283</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>18</OPCODE>
<DATA>
- <TRANSACTION_ID>30</TRANSACTION_ID>
+ <TRANSACTION_ID>48</TRANSACTION_ID>
<T_VERSION>0</T_VERSION>
<T_OWNER>todd</T_OWNER>
<T_RENEWER>JobTracker</T_RENEWER>
<T_REAL_USER/>
- <T_ISSUE_DATE>1304060057773</T_ISSUE_DATE>
- <T_MAX_DATE>1304664857773</T_MAX_DATE>
+ <T_ISSUE_DATE>1330405686056</T_ISSUE_DATE>
+ <T_MAX_DATE>1331010486056</T_MAX_DATE>
<T_SEQUENCE_NUMBER>1</T_SEQUENCE_NUMBER>
<T_MASTER_KEY_ID>2</T_MASTER_KEY_ID>
- <T_EXPIRY_TIME>1304146457773</T_EXPIRY_TIME>
+ <T_EXPIRY_TIME>1330492086056</T_EXPIRY_TIME>
</DATA>
- <CHECKSUM>913145699</CHECKSUM>
+ <CHECKSUM>791321007</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>19</OPCODE>
<DATA>
- <TRANSACTION_ID>31</TRANSACTION_ID>
+ <TRANSACTION_ID>49</TRANSACTION_ID>
<T_VERSION>0</T_VERSION>
<T_OWNER>todd</T_OWNER>
<T_RENEWER>JobTracker</T_RENEWER>
<T_REAL_USER/>
- <T_ISSUE_DATE>1304060057773</T_ISSUE_DATE>
- <T_MAX_DATE>1304664857773</T_MAX_DATE>
+ <T_ISSUE_DATE>1330405686056</T_ISSUE_DATE>
+ <T_MAX_DATE>1331010486056</T_MAX_DATE>
<T_SEQUENCE_NUMBER>1</T_SEQUENCE_NUMBER>
<T_MASTER_KEY_ID>2</T_MASTER_KEY_ID>
- <T_EXPIRY_TIME>1304146457785</T_EXPIRY_TIME>
+ <T_EXPIRY_TIME>1330492086075</T_EXPIRY_TIME>
</DATA>
- <CHECKSUM>-1772039941</CHECKSUM>
+ <CHECKSUM>649714969</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>20</OPCODE>
<DATA>
- <TRANSACTION_ID>32</TRANSACTION_ID>
+ <TRANSACTION_ID>50</TRANSACTION_ID>
<T_VERSION>0</T_VERSION>
<T_OWNER>todd</T_OWNER>
<T_RENEWER>JobTracker</T_RENEWER>
<T_REAL_USER/>
- <T_ISSUE_DATE>1304060057773</T_ISSUE_DATE>
- <T_MAX_DATE>1304664857773</T_MAX_DATE>
+ <T_ISSUE_DATE>1330405686056</T_ISSUE_DATE>
+ <T_MAX_DATE>1331010486056</T_MAX_DATE>
<T_SEQUENCE_NUMBER>1</T_SEQUENCE_NUMBER>
<T_MASTER_KEY_ID>2</T_MASTER_KEY_ID>
</DATA>
- <CHECKSUM>1382094146</CHECKSUM>
+ <CHECKSUM>1190872628</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>51</TRANSACTION_ID>
+ <GENERATION_STAMP>1015</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>-460593521</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>0</OPCODE>
<DATA>
- <TRANSACTION_ID>33</TRANSACTION_ID>
- <LENGTH>5</LENGTH>
- <PATH>/reassign-lease-test</PATH>
+ <TRANSACTION_ID>52</TRANSACTION_ID>
+ <PATH>/hard-lease-recovery-test</PATH>
<REPLICATION>1</REPLICATION>
- <MTIME>1286491964741</MTIME>
- <ATIME>1286491964741</ATIME>
+ <MTIME>1330405686084</MTIME>
+ <ATIME>1330405686084</ATIME>
<BLOCKSIZE>512</BLOCKSIZE>
<NUMBLOCKS>0</NUMBLOCKS>
<PERMISSION_STATUS>
- <USERNAME>atm</USERNAME>
+ <USERNAME>todd</USERNAME>
<GROUPNAME>supergroup</GROUPNAME>
<FS_PERMISSIONS>420</FS_PERMISSIONS>
</PERMISSION_STATUS>
- <CLIENT_NAME>DFSClient_871171074</CLIENT_NAME>
+ <CLIENT_NAME>DFSClient_NONMAPREDUCE_-2143415023_1</CLIENT_NAME>
<CLIENT_MACHINE>127.0.0.1</CLIENT_MACHINE>
</DATA>
- <CHECKSUM>1975140107</CHECKSUM>
+ <CHECKSUM>2093219037</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>53</TRANSACTION_ID>
+ <GENERATION_STAMP>1016</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>120488596</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>54</TRANSACTION_ID>
+ <PATH>/hard-lease-recovery-test</PATH>
+ <NUMBLOCKS>1</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-357061736603024522</BLOCK_ID>
+ <BLOCK_NUM_BYTES>0</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1016</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>2098840974</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>25</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>55</TRANSACTION_ID>
+ <PATH>/hard-lease-recovery-test</PATH>
+ <NUMBLOCKS>1</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-357061736603024522</BLOCK_ID>
+ <BLOCK_NUM_BYTES>0</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1016</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ </DATA>
+ <CHECKSUM>-1794222801</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>10</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>56</TRANSACTION_ID>
+ <GENERATION_STAMP>1017</GENERATION_STAMP>
+ </DATA>
+ <CHECKSUM>-2123999915</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>22</OPCODE>
<DATA>
- <TRANSACTION_ID>34</TRANSACTION_ID>
- <CLIENT_NAME>DFSClient_871171074</CLIENT_NAME>
- <PATH>/reassign-lease-test</PATH>
+ <TRANSACTION_ID>57</TRANSACTION_ID>
+ <CLIENT_NAME>DFSClient_NONMAPREDUCE_-2143415023_1</CLIENT_NAME>
+ <PATH>/hard-lease-recovery-test</PATH>
<CLIENT_NAME>HDFS_NameNode</CLIENT_NAME>
</DATA>
- <CHECKSUM>1975140107</CHECKSUM>
+ <CHECKSUM>-1841690515</CHECKSUM>
+ </RECORD>
+ <RECORD>
+ <OPCODE>9</OPCODE>
+ <DATA>
+ <TRANSACTION_ID>58</TRANSACTION_ID>
+ <PATH>/hard-lease-recovery-test</PATH>
+ <REPLICATION>1</REPLICATION>
+ <MTIME>1330405688726</MTIME>
+ <ATIME>1330405686084</ATIME>
+ <BLOCKSIZE>512</BLOCKSIZE>
+ <NUMBLOCKS>1</NUMBLOCKS>
+ <BLOCK>
+ <BLOCK_ID>-357061736603024522</BLOCK_ID>
+ <BLOCK_NUM_BYTES>11</BLOCK_NUM_BYTES>
+ <BLOCK_GENERATION_STAMP>1017</BLOCK_GENERATION_STAMP>
+ </BLOCK>
+ <PERMISSION_STATUS>
+ <USERNAME>todd</USERNAME>
+ <GROUPNAME>supergroup</GROUPNAME>
+ <FS_PERMISSIONS>420</FS_PERMISSIONS>
+ </PERMISSION_STATUS>
+ </DATA>
+ <CHECKSUM>-218102037</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>23</OPCODE>
<DATA>
- <TRANSACTION_ID>35</TRANSACTION_ID>
+ <TRANSACTION_ID>59</TRANSACTION_ID>
</DATA>
- <CHECKSUM>1975140107</CHECKSUM>
+ <CHECKSUM>-1616653774</CHECKSUM>
</RECORD>
<RECORD>
<OPCODE>-1</OPCODE>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-1.0-multiblock-file.tgz b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-1.0-multiblock-file.tgz
new file mode 100644
index 0000000..8e327c2
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-1.0-multiblock-file.tgz
Binary files differ
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-policy.xml b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-policy.xml
index 0f5310c..eb3f4bd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-policy.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/hadoop-policy.xml
@@ -109,5 +109,12 @@
group list is separated by a blank. For e.g. "alice,bob users,wheel".
A special value of "*" means all users are allowed.</description>
</property>
-
+
+ <property>
+ <name>security.ha.service.protocol.acl</name>
+ <value>*</value>
+ <description>ACL for the HAService protocol, used by HAAdmin to manage the
+ active and standby states of the NameNode.</description>
+ </property>
+
</configuration>
diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/scripts/saveVersion.sh b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/scripts/saveVersion.sh
index 11d7022..e644bbf 100755
--- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/scripts/saveVersion.sh
+++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-common/scripts/saveVersion.sh
@@ -34,7 +34,7 @@
url="git://${hostname}${cwd}"
elif [ -d .svn ]; then
revision=`svn info ../ | sed -n -e 's/Last Changed Rev: \(.*\)/\1/p'`
- url=`svn info ../ | sed -n -e 's/URL: \(.*\)/\1/p'`
+ url=`svn info ../ | sed -n -e 's/^URL: \(.*\)/\1/p'`
# Get canonical branch (branches/X, tags/X, or trunk)
branch=`echo $url | sed -n -e 's,.*\(branches/.*\)$,\1,p' \
-e 's,.*\(tags/.*\)$,\1,p' \
diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/HDFSHighAvailability.apt.vm b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/HDFSHighAvailability.apt.vm
new file mode 100644
index 0000000..c665067
--- /dev/null
+++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/HDFSHighAvailability.apt.vm
@@ -0,0 +1,434 @@
+~~ Licensed under the Apache License, Version 2.0 (the "License");
+~~ you may not use this file except in compliance with the License.
+~~ You may obtain a copy of the License at
+~~
+~~ http://www.apache.org/licenses/LICENSE-2.0
+~~
+~~ Unless required by applicable law or agreed to in writing, software
+~~ distributed under the License is distributed on an "AS IS" BASIS,
+~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+~~ See the License for the specific language governing permissions and
+~~ limitations under the License. See accompanying LICENSE file.
+
+ ---
+ Hadoop Distributed File System-${project.version} - High Availability
+ ---
+ ---
+ ${maven.build.timestamp}
+
+HDFS High Availability
+
+ \[ {{{./index.html}Go Back}} \]
+
+%{toc|section=1|fromDepth=0}
+
+* {Purpose}
+
+ This guide provides an overview of the HDFS High Availability (HA) feature and
+ how to configure and manage an HA HDFS cluster.
+
+ This document assumes that the reader has a general understanding of the
+ components and node types in an HDFS cluster. Please refer to the
+ HDFS Architecture guide for details.
+
+* {Background}
+
+ Prior to Hadoop 0.23.2, the NameNode was a single point of failure (SPOF) in
+ an HDFS cluster. Each cluster had a single NameNode, and if that machine or
+ process became unavailable, the cluster as a whole would be unavailable
+ until the NameNode was either restarted or brought up on a separate machine.
+
+ This impacted the total availability of the HDFS cluster in two major ways:
+
+ * In the case of an unplanned event such as a machine crash, the cluster would
+ be unavailable until an operator restarted the NameNode.
+
+ * Planned maintenance events such as software or hardware upgrades on the
+ NameNode machine would result in windows of cluster downtime.
+
+ The HDFS High Availability feature addresses the above problems by providing
+ the option of running two redundant NameNodes in the same cluster in an
+ Active/Passive configuration with a hot standby. This allows a fast failover to
+ a new NameNode in the case that a machine crashes, or a graceful
+ administrator-initiated failover for the purpose of planned maintenance.
+
+* {Architecture}
+
+ In a typical HA cluster, two separate machines are configured as NameNodes.
+ At any point in time, exactly one of the NameNodes is in an <Active> state,
+ and the other is in a <Standby> state. The Active NameNode is responsible
+ for all client operations in the cluster, while the Standby is simply acting
+ as a slave, maintaining enough state to provide a fast failover if
+ necessary.
+
+ In order for the Standby node to keep its state synchronized with the Active
+ node, the current implementation requires that the two nodes both have access
+ to a directory on a shared storage device (e.g. an NFS mount from a NAS). This
+ restriction will likely be relaxed in future versions.
+
+ When any namespace modification is performed by the Active node, it durably
+ logs a record of the modification to an edit log file stored in the shared
+ directory. The Standby node is constantly watching this directory for edits,
+ and as it sees the edits, it applies them to its own namespace. In the event of
+ a failover, the Standby will ensure that it has read all of the edits from the
+ shared storage before promoting itself to the Active state. This ensures that
+ the namespace state is fully synchronized before a failover occurs.
+
+ In order to provide a fast failover, it is also necessary that the Standby node
+ have up-to-date information regarding the location of blocks in the cluster.
+ In order to achieve this, the DataNodes are configured with the location of
+ both NameNodes, and send block location information and heartbeats to both.
+
+ It is vital for the correct operation of an HA cluster that only one of the
+ NameNodes be Active at a time. Otherwise, the namespace state would quickly
+ diverge between the two, risking data loss or other incorrect results. In
+ order to ensure this property and prevent the so-called "split-brain scenario,"
+ the administrator must configure at least one <fencing method> for the shared
+ storage. During a failover, if it cannot be verified that the previous Active
+ node has relinquished its Active state, the fencing process is responsible for
+ cutting off the previous Active's access to the shared edits storage. This
+ prevents it from making any further edits to the namespace, allowing the new
+ Active to safely proceed with failover.
+
+ <<Note:>> Currently, only manual failover is supported. This means the HA
+ NameNodes are incapable of automatically detecting a failure of the Active
+ NameNode, and instead rely on the operator to manually initiate a failover.
+ Automatic failure detection and initiation of a failover will be implemented in
+ future versions.
+
+* {Hardware resources}
+
+ In order to deploy an HA cluster, you should prepare the following:
+
+ * <<NameNode machines>> - the machines on which you run the Active and
+ Standby NameNodes should have equivalent hardware to each other, and
+ equivalent hardware to what would be used in a non-HA cluster.
+
+ * <<Shared storage>> - you will need to have a shared directory which both
+ NameNode machines can have read/write access to. Typically this is a remote
+ filer which supports NFS and is mounted on each of the NameNode machines.
+ Currently only a single shared edits directory is supported. Thus, the
+ availability of the system is limited by the availability of this shared edits
+ directory, and therefore in order to remove all single points of failure there
+ needs to be redundancy for the shared edits directory: specifically, multiple
+ network paths to the storage, and redundancy in the storage itself (disk,
+ network, and power). Because of this, it is recommended that the shared storage
+ server be a high-quality dedicated NAS appliance rather than a simple Linux
+ server.
+
+ Note that, in an HA cluster, the Standby NameNode also performs checkpoints of
+ the namespace state, and thus it is not necessary to run a Secondary NameNode,
+ CheckpointNode, or BackupNode in an HA cluster. In fact, to do so would be an
+ error. It also means that anyone converting a non-HA-enabled HDFS cluster to an
+ HA-enabled one can reuse the hardware previously dedicated to the Secondary
+ NameNode.
+
+* {Deployment}
+
+** Configuration overview
+
+ Similar to Federation configuration, HA configuration is backward compatible
+ and allows existing single NameNode configurations to work without change.
+ The new configuration is designed such that all the nodes in the cluster may
+ have the same configuration without the need for deploying different
+ configuration files to different machines based on the type of the node.
+
+ Like HDFS Federation, HA clusters reuse the <<<nameservice ID>>> to identify a
+ single HDFS instance that may in fact consist of multiple HA NameNodes. In
+ addition, a new abstraction called <<<NameNode ID>>> is added with HA. Each
+ distinct NameNode in the cluster has a different NameNode ID to distinguish it.
+ To support a single configuration file for all of the NameNodes, the relevant
+ configuration parameters are suffixed with the <<nameservice ID>> as well as
+ the <<NameNode ID>>.
+
+** Configuration details
+
+ To configure HA NameNodes, you must add several configuration options to your
+ <<hdfs-site.xml>> configuration file.
+
+ The order in which you set these configurations is unimportant, but the values
+ you choose for <<dfs.federation.nameservices>> and
+ <<dfs.ha.namenodes.[nameservice ID]>> will determine the keys of those that
+ follow. Thus, you should decide on these values before setting the rest of the
+ configuration options.
+
+ * <<dfs.federation.nameservices>> - the logical name for this new nameservice
+
+ Choose a logical name for this nameservice, for example "mycluster", and use
+ this logical name for the value of this config option. The name you choose is
+ arbitrary. It will be used both for configuration and as the authority
+ component of absolute HDFS paths in the cluster.
+
+ <<Note:>> If you are also using HDFS Federation, this configuration setting
+ should also include the list of other nameservices, HA or otherwise, as a
+ comma-separated list.
+
+----
+<property>
+ <name>dfs.federation.nameservices</name>
+ <value>mycluster</value>
+</property>
+----
+
+ * <<dfs.ha.namenodes.[nameservice ID]>> - unique identifiers for each NameNode in the nameservice
+
+ Configure with a list of comma-separated NameNode IDs. This will be used by
+ DataNodes to determine all the NameNodes in the cluster. For example, if you
+ used "mycluster" as the nameservice ID previously, and you wanted to use "nn1"
+ and "nn2" as the individual IDs of the NameNodes, you would configure this as
+ such:
+
+----
+<property>
+ <name>dfs.ha.namenodes.mycluster</name>
+ <value>nn1,nn2</value>
+</property>
+----
+
+ <<Note:>> Currently, a maximum of two NameNodes may be configured per
+ nameservice.
+
+ * <<dfs.namenode.rpc-address.[nameservice ID].[name node ID]>> - the fully-qualified RPC address for each NameNode to listen on
+
+ For both of the previously-configured NameNode IDs, set the full address and
+ IPC port of the NameNode process. Note that this results in two separate
+ configuration options. For example:
+
+----
+<property>
+ <name>dfs.namenode.rpc-address.mycluster.nn1</name>
+ <value>machine1.example.com:8020</value>
+</property>
+<property>
+ <name>dfs.namenode.rpc-address.mycluster.nn2</name>
+ <value>machine2.example.com:8020</value>
+</property>
+----
+
+ <<Note:>> You may similarly configure the "<<servicerpc-address>>" setting if
+ you so desire.
+
+ * <<dfs.namenode.http-address.[nameservice ID].[name node ID]>> - the fully-qualified HTTP address for each NameNode to listen on
+
+ Similarly to <rpc-address> above, set the addresses for both NameNodes' HTTP
+ servers to listen on. For example:
+
+----
+<property>
+ <name>dfs.namenode.http-address.mycluster.nn1</name>
+ <value>machine1.example.com:50070</value>
+</property>
+<property>
+ <name>dfs.namenode.http-address.mycluster.nn2</name>
+ <value>machine2.example.com:50070</value>
+</property>
+----
+
+ <<Note:>> If you have Hadoop's security features enabled, you should also set
+ the <https-address> similarly for each NameNode.
+
+ * <<dfs.namenode.shared.edits.dir>> - the location of the shared storage directory
+
+ This is where one configures the path to the remote shared edits directory
+ which the Standby NameNode uses to stay up-to-date with all the file system
+ changes the Active NameNode makes. <<You should only configure one of these
+ directories.>> This directory should be mounted r/w on both NameNode machines.
+ The value of this setting should be the absolute path to this directory on the
+ NameNode machines. For example:
+
+----
+<property>
+ <name>dfs.namenode.shared.edits.dir</name>
+ <value>file:///mnt/filer1/dfs/ha-name-dir-shared</value>
+</property>
+----
+
+ * <<dfs.client.failover.proxy.provider.[nameservice ID]>> - the Java class that HDFS clients use to contact the Active NameNode
+
+ Configure the name of the Java class which will be used by the DFS Client to
+ determine which NameNode is the current Active, and therefore which NameNode is
+ currently serving client requests. The only implementation which currently
+ ships with Hadoop is the <<ConfiguredFailoverProxyProvider>>, so use this
+ unless you are using a custom one. For example:
+
+----
+<property>
+ <name>dfs.client.failover.proxy.provider.mycluster</name>
+ <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
+</property>
+----
+
+ * <<dfs.ha.fencing.methods>> - a list of scripts or Java classes which will be used to fence the Active NameNode during a failover
+
+ It is critical for correctness of the system that only one NameNode be in the
+ Active state at any given time. Thus, during a failover, we first ensure that
+ the Active NameNode is either in the Standby state, or the process has
+ terminated, before transitioning the other NameNode to the Active state. In
+ order to do this, you must configure at least one <<fencing method.>> These are
+ configured as a carriage-return-separated list, which will be attempted in order
+ until one indicates that fencing has succeeded. There are two methods which
+ ship with Hadoop: <shell> and <sshfence>. For information on implementing
+ your own custom fencing method, see the <org.apache.hadoop.ha.NodeFencer> class.
+
+ * <<sshfence>> - SSH to the Active NameNode and kill the process
+
+ The <sshfence> option SSHes to the target node and uses <fuser> to kill the
+ process listening on the service's TCP port. In order for this fencing option
+ to work, it must be able to SSH to the target node without providing a
+ passphrase. Thus, one must also configure the
+ <<dfs.ha.fencing.ssh.private-key-files>> option, which is a
+ comma-separated list of SSH private key files. For example:
+
+---
+<property>
+ <name>dfs.ha.fencing.methods</name>
+ <value>sshfence</value>
+</property>
+
+<property>
+ <name>dfs.ha.fencing.ssh.private-key-files</name>
+ <value>/home/exampleuser/.ssh/id_rsa</value>
+</property>
+---
+
+ Optionally, one may configure a non-standard username or port to perform the
+ SSH. One may also configure a timeout, in milliseconds, for the SSH, after
+ which this fencing method will be considered to have failed. It may be
+ configured like so:
+
+---
+<property>
+ <name>dfs.ha.fencing.methods</name>
+ <value>sshfence([[username][:port]])</value>
+</property>
+<property>
+ <name>dfs.ha.fencing.ssh.connect-timeout</name>
+ <value>30000</value>
+</property>
+---
+
+ * <<shell>> - run an arbitrary shell command to fence the Active NameNode
+
+ The <shell> fencing method runs an arbitrary shell command. It may be
+ configured like so:
+
+---
+<property>
+ <name>dfs.ha.fencing.methods</name>
+ <value>shell(/path/to/my/script.sh arg1 arg2 ...)</value>
+</property>
+---
+
+ The string between '(' and ')' is passed directly to a bash shell and may not
+ include any closing parentheses.
+
+ When executed, the first argument to the configured script will be the address
+ of the NameNode to be fenced, followed by all arguments specified in the
+ configuration.
+
+ The shell command will be run with an environment set up to contain all of the
+ current Hadoop configuration variables, with the '_' character replacing any
+ '.' characters in the configuration keys. If the shell command returns an exit
+ code of 0, the fencing is determined to be successful. If it returns any other
+ exit code, the fencing was not successful and the next fencing method in the
+ list will be attempted.
+
+ <<Note:>> This fencing method does not implement any timeout. If timeouts are
+ necessary, they should be implemented in the shell script itself (e.g. by forking
+ a subshell to kill its parent in some number of seconds).
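+
+ As a minimal sketch only (the host name handling and the remote power-off
+ command below are hypothetical, not part of Hadoop), such a script might look
+ like:
+
+---
+#!/bin/bash
+# $1 is the address of the NameNode to be fenced (e.g. "machine1.example.com:8020");
+# any further arguments come from the dfs.ha.fencing.methods configuration.
+target_host="${1%%:*}"
+
+# Cut power to the target through a hypothetical PDU controller. Exiting 0
+# marks the fencing as successful; any other exit code causes the next
+# configured fencing method to be attempted.
+/usr/local/bin/pdu-power-off "$target_host"
+exit $?
+---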
+
+ * <<fs.defaultFS>> - the default path prefix used by the Hadoop FS client when none is given
+
+ Optionally, you may now configure the default path for Hadoop clients to use
+ the new HA-enabled logical URI. If you used "mycluster" as the nameservice ID
+ earlier, this will be the value of the authority portion of all of your HDFS
+ paths. This may be configured like so, in your <<core-site.xml>> file:
+
+---
+<property>
+ <name>fs.defaultFS</name>
+ <value>hdfs://mycluster</value>
+</property>
+---
+
+** Deployment details
+
+ After all of the necessary configuration options have been set, one must
+ initially synchronize the two HA NameNodes' on-disk metadata. If you are
+ setting up a fresh HDFS cluster, you should first run the format command (<hdfs
+ namenode -format>) on one of NameNodes. If you have already formatted the
+ NameNode, or are converting a non-HA-enabled cluster to be HA-enabled, you
+ should now copy over the contents of your NameNode metadata directories to
+ the other, unformatted NameNode using <scp> or a similar utility. The locations
+ of the directories containing the NameNode metadata are configured via the
+ configuration options <<dfs.namenode.name.dir>> and/or
+ <<dfs.namenode.edits.dir>>. At this time, you should also ensure that the
+ shared edits dir (as configured by <<dfs.namenode.shared.edits.dir>>) includes
+ all recent edits files which are in your NameNode metadata directories.
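+
+ For example (the metadata path and host name below are illustrative only):
+
+---
+# On the first NameNode of a fresh cluster:
+$ hdfs namenode -format
+
+# Copy the NameNode metadata directory to the second, unformatted NameNode:
+$ scp -r /data/dfs/name/ nn2.example.com:/data/dfs/name/
+---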
+
+ At this point you may start both of your HA NameNodes as you normally would
+ start a NameNode.
+
+ You can visit each of the NameNodes' web pages separately by browsing to their
+ configured HTTP addresses. You should notice that next to the configured
+ address will be the HA state of the NameNode (either "standby" or "active").
+ Whenever an HA NameNode starts, it is initially in the Standby state.
+
+** Administrative commands
+
+ Now that your HA NameNodes are configured and started, you will have access
+ to some additional commands to administer your HA HDFS cluster. Specifically,
+ you should familiarize yourself with all of the subcommands of the "<hdfs
+ haadmin>" command. Running this command without any additional arguments will
+ display the following usage information:
+
+---
+Usage: DFSHAAdmin [-ns <nameserviceId>]
+ [-transitionToActive <serviceId>]
+ [-transitionToStandby <serviceId>]
+ [-failover [--forcefence] [--forceactive] <serviceId> <serviceId>]
+ [-getServiceState <serviceId>]
+ [-checkHealth <serviceId>]
+ [-help <command>]
+---
+
+ This guide describes high-level uses of each of these subcommands. For
+ specific usage information of each subcommand, you should run "<hdfs haadmin
+ -help <command>>".
+
+ * <<transitionToActive>> and <<transitionToStandby>> - transition the state of the given NameNode to Active or Standby
+
+ These subcommands cause a given NameNode to transition to the Active or Standby
+ state, respectively. <<These commands do not attempt to perform any fencing,
+ and thus should rarely be used.>> Instead, one should almost always prefer to
+ use the "<hdfs haadmin -failover>" subcommand.
+
+ * <<failover>> - initiate a failover between two NameNodes
+
+ This subcommand causes a failover from the first provided NameNode to the
+ second. If the first NameNode is in the Standby state, this command simply
+ transitions the second to the Active state without error. If the first NameNode
+ is in the Active state, an attempt will be made to gracefully transition it to
+ the Standby state. If this fails, the fencing methods (as configured by
+ <<dfs.ha.fencing.methods>>) will be attempted in order until one
+ succeeds. Only after this process will the second NameNode be transitioned to
+ the Active state. If no fencing method succeeds, the second NameNode will not
+ be transitioned to the Active state, and an error will be returned.
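+
+ For example, using the NameNode IDs "nn1" and "nn2" from the configuration
+ section above, a failover from the first to the second would be initiated as:
+
+---
+$ hdfs haadmin -failover nn1 nn2
+---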
+
+ * <<getServiceState>> - determine whether the given NameNode is Active or Standby
+
+ Connect to the provided NameNode to determine its current state, printing
+ either "standby" or "active" to STDOUT appropriately. This subcommand might be
+ used by cron jobs or monitoring scripts which need to behave differently based
+ on whether the NameNode is currently Active or Standby.
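+
+ For example (again using the illustrative NameNode ID "nn1"):
+
+---
+$ hdfs haadmin -getServiceState nn1
+active
+---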
+
+ * <<checkHealth>> - check the health of the given NameNode
+
+ Connect to the provided NameNode to check its health. The NameNode is capable
+ of performing some diagnostics on itself, including checking if internal
+ services are running as expected. This command will return 0 if the NameNode is
+ healthy, non-zero otherwise. One might use this command for monitoring
+ purposes.
+
+ <<Note:>> This is not yet implemented, and at present will always return
+ success, unless the given NameNode is completely down.
diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml
index 93e9742..1cf5a72 100644
--- a/hadoop-project/pom.xml
+++ b/hadoop-project/pom.xml
@@ -593,6 +593,11 @@
<version>${commons-daemon.version}</version>
</dependency>
<dependency>
+ <groupId>com.jcraft</groupId>
+ <artifactId>jsch</artifactId>
+ <version>0.1.42</version>
+ </dependency>
+ <dependency>
<groupId>org.jdom</groupId>
<artifactId>jdom</artifactId>
<version>1.1</version>
diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml
index 27f9b7b..f992a17 100644
--- a/hadoop-project/src/site/site.xml
+++ b/hadoop-project/src/site/site.xml
@@ -53,6 +53,7 @@
</menu>
<menu name="HDFS" inherit="top">
+ <item name="High Availability" href="hadoop-yarn/hadoop-yarn-site/HDFSHighAvailability.html"/>
<item name="Federation" href="hadoop-yarn/hadoop-yarn-site/Federation.html"/>
<item name="WebHDFS REST API" href="hadoop-yarn/hadoop-yarn-site/WebHDFS.html"/>
<item name="HttpFS Gateway" href="hadoop-hdfs-httpfs/index.html"/>
diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java
index 1d248f0..563372e 100644
--- a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java
+++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java
@@ -22,7 +22,9 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.io.retry.RetryPolicy.RetryAction;
import org.apache.hadoop.io.retry.RetryPolicies;
+import org.apache.hadoop.util.ThreadUtil;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
@@ -80,7 +82,7 @@
public Object execute(Object... arguments) throws Exception {
Exception latestException;
int counter = 0;
- do {
+ while (true) {
try {
return doExecute(arguments);
} catch(Exception exception) {
@@ -88,7 +90,13 @@
latestException = exception;
}
counter++;
- } while (retryPolicy.shouldRetry(latestException, counter, 0, true).equals(RetryPolicy.RetryAction.RETRY));
+ RetryAction action = retryPolicy.shouldRetry(latestException, counter, 0, true);
+ if (action.action == RetryPolicy.RetryAction.RetryDecision.RETRY) {
+ ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis);
+ } else {
+ break;
+ }
+ }
throw new IOException("Couldn't run retriable-command: " + description,
latestException);
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
index e5ab059..5ba5eb8 100644
--- a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java
@@ -545,7 +545,12 @@
Assert.fail("Didn't expect the file to be copied");
} catch (AccessControlException ignore) {
} catch (Exception e) {
- if (e.getCause() == null || !(e.getCause() instanceof AccessControlException)) {
+ // We want to make sure the underlying cause of the exception is
+ // due to permissions error. The exception we're interested in is
+ // wrapped twice - once in RetriableCommand and again in CopyMapper
+ // itself.
+ if (e.getCause() == null || e.getCause().getCause() == null ||
+ !(e.getCause().getCause() instanceof AccessControlException)) {
throw new RuntimeException(e);
}
}