HDDS-6959. Fix move timeout in latest iteration metric (#3562)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
index 49dd12b..a77f7a8 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
@@ -635,6 +635,8 @@
future = replicationManager
.move(containerID, source, moveSelection.getTargetNode())
.whenComplete((result, ex) -> {
+
+ metrics.incrementCurrentIterationContainerMoveMetric(result, 1);
if (ex != null) {
LOG.info("Container move for container {} from source {} to " +
"target {} failed with exceptions {}",
@@ -645,7 +647,6 @@
if (result == LegacyReplicationManager.MoveResult.COMPLETED) {
metrics.incrementDataSizeMovedGBInLatestIteration(
containerInfo.getUsedBytes() / OzoneConsts.GB);
- metrics.incrementNumContainerMovesCompletedInLatestIteration(1);
if (LOG.isDebugEnabled()) {
LOG.debug(
"Container move completed for container {} to target {}",
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
index 3a7ce49..b135c1c 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.hdds.scm.container.balancer;
+import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager.MoveResult;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
@@ -121,6 +122,42 @@
this.numContainerMovesCompletedInLatestIteration.incr(valueToAdd);
}
+ /**
+ * Increments the latest-iteration move counter that matches the outcome
+ * of a container move.
+ * <ul>
+ * <li>{@code COMPLETED} adds to
+ * {@code numContainerMovesCompletedInLatestIteration}.</li>
+ * <li>{@code REPLICATION_FAIL_TIME_OUT} and {@code DELETION_FAIL_TIME_OUT}
+ * add to {@code numContainerMovesTimeoutInLatestIteration}.</li>
+ * <li>Every other result leaves all counters unchanged.</li>
+ * </ul>
+ *
+ * @param result outcome of the move; {@code null} is accepted and leaves
+ * all counters unchanged
+ * @param valueToAdd amount to add to the matching counter
+ */
+ public void incrementCurrentIterationContainerMoveMetric(
+ MoveResult result,
+ long valueToAdd) {
+ // Guard first: switching on a null enum reference would throw.
+ if (result == null) {
+ return;
+ }
+ switch (result) {
+ case COMPLETED:
+ this.numContainerMovesCompletedInLatestIteration.incr(valueToAdd);
+ break;
+ // Both timeout flavours share a single counter.
+ case REPLICATION_FAIL_TIME_OUT:
+ case DELETION_FAIL_TIME_OUT:
+ this.numContainerMovesTimeoutInLatestIteration.incr(valueToAdd);
+ break;
+ // TODO: Add metrics for other errors that need to be tracked.
+ // The results below are listed explicitly but fall through to the
+ // no-op default, so none of them changes a counter yet.
+ case FAIL_NOT_RUNNING:
+ case REPLICATION_FAIL_INFLIGHT_REPLICATION:
+ case FAIL_NOT_LEADER:
+ case REPLICATION_FAIL_NOT_EXIST_IN_SOURCE:
+ case REPLICATION_FAIL_EXIST_IN_TARGET:
+ case REPLICATION_FAIL_CONTAINER_NOT_CLOSED:
+ case REPLICATION_FAIL_INFLIGHT_DELETION:
+ case REPLICATION_FAIL_NODE_NOT_IN_SERVICE:
+ case DELETION_FAIL_NODE_NOT_IN_SERVICE:
+ case REPLICATION_FAIL_NODE_UNHEALTHY:
+ case DELETION_FAIL_NODE_UNHEALTHY:
+ case DELETE_FAIL_POLICY:
+ case PLACEMENT_POLICY_NOT_SATISFIED:
+ case UNEXPECTED_REMOVE_SOURCE_AT_INFLIGHT_REPLICATION:
+ case UNEXPECTED_REMOVE_TARGET_AT_INFLIGHT_DELETION:
+ case FAIL_CAN_NOT_RECORD_TO_DB:
+ default:
+ break;
+ }
+ }
+
public void resetNumContainerMovesCompletedInLatestIteration() {
numContainerMovesCompletedInLatestIteration.incr(
-getNumContainerMovesCompletedInLatestIteration());
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index 5d4da10..fc643af 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -458,7 +458,7 @@
/**
- * following functions will be refactored in a seperate jira.
+ * following functions will be refactored in a separate jira.
*/
public CompletableFuture<LegacyReplicationManager.MoveResult> move(
ContainerID cid, DatanodeDetails src, DatanodeDetails tgt)
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
index d57226b..dd27e8f 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
@@ -740,6 +740,37 @@
}
+ /**
+ * Checks that timeout results coming back from the replication manager
+ * are counted in the latest-iteration timeout metric. {@code move} is
+ * stubbed with futures that yield {@code REPLICATION_FAIL_TIME_OUT} and
+ * {@code DELETION_FAIL_TIME_OUT}; after the balancer has been started and
+ * given time to run, the timeout counter must be greater than zero.
+ */
@Test
+ public void checkIterationResultTimeoutFromReplicationManager()
+ throws NodeNotFoundException, IOException,
+ IllegalContainerBalancerStateException,
+ InvalidContainerBalancerConfigurationException {
+ // Futures that resolve directly to the two timeout results.
+ CompletableFuture<MoveResult> future
+ = CompletableFuture.supplyAsync(() ->
+ MoveResult.REPLICATION_FAIL_TIME_OUT);
+ CompletableFuture<MoveResult> future2
+ = CompletableFuture.supplyAsync(() ->
+ MoveResult.DELETION_FAIL_TIME_OUT);
+ // Consecutive stubbing: 'future' for the first call, then 'future2'.
+ Mockito.when(replicationManager.move(Mockito.any(ContainerID.class),
+ Mockito.any(DatanodeDetails.class),
+ Mockito.any(DatanodeDetails.class)))
+ .thenReturn(future, future2);
+
+ balancerConfiguration.setThreshold(10);
+ balancerConfiguration.setIterations(1);
+ balancerConfiguration.setMaxSizeEnteringTarget(10 * OzoneConsts.GB);
+ balancerConfiguration.setMaxSizeToMovePerIteration(100 * OzoneConsts.GB);
+ balancerConfiguration.setMaxDatanodesPercentageToInvolvePerIteration(100);
+ balancerConfiguration.setMoveTimeout(Duration.ofMillis(1000));
+
+ startBalancer(balancerConfiguration);
+ // NOTE(review): the 2000 ms wait presumably gives the single iteration
+ // time to finish; confirm against sleepWhileBalancing.
+ sleepWhileBalancing(2000);
+
+ Assertions.assertTrue(containerBalancer.getMetrics()
+ .getNumContainerMovesTimeoutInLatestIteration() > 0);
+ stopBalancer();
+ }
+
+ @Test
public void testStartAndImmediateStopForDeadlock()
throws IllegalContainerBalancerStateException, IOException,
InvalidContainerBalancerConfigurationException {