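Add new error message for customized partition check host connection error (#1984)
Add a HOST_NO_STATE_ERROR message, suffixed with the sibling instance name, that the customized partition health check reports when it has no partition state for that sibling instance.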
diff --git a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
index 5939e19..d35fdb4 100644
--- a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
+++ b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
@@ -55,6 +55,7 @@
ImmutableSet.of(HelixDefinedState.DROPPED.name(), HelixDefinedState.ERROR.name());
static final String UNHEALTHY_PARTITION = "UNHEALTHY_PARTITION";
+ static final String HOST_NO_STATE_ERROR = "HOST_NO_STATE_ERROR:";
// The message that will be shown if partition is in initial state of the state model and
// partition health check has been skipped for that instance
static final String PARTITION_INITIAL_STATE_FAIL = "PARTITION_INITIAL_STATE_FAIL";
@@ -257,11 +258,16 @@
continue;
}
- // We are checking sibling partition healthy status. So if partition health does not
- // exist or it is not healthy. We should mark this partition is unhealthy.
+ // If we failed to get partition assignment for one sibling instance, we add the
+ // instance name in return error for debuggability.
if (!globalPartitionHealthStatus.containsKey(siblingInstance)
- || !globalPartitionHealthStatus.get(siblingInstance).containsKey(partition)
- || !globalPartitionHealthStatus.get(siblingInstance).get(partition)) {
+ || globalPartitionHealthStatus.get(siblingInstance).isEmpty()) {
+ unhealthyPartitions.computeIfAbsent(partition, list -> new ArrayList<>())
+ .add(HOST_NO_STATE_ERROR + siblingInstance);
+ } else if (!globalPartitionHealthStatus.get(siblingInstance)
+ .getOrDefault(partition, false)) {
+ // The sibling reported some state: if this partition's health entry is missing
+ // (defaults to false) or is false, mark the partition as unhealthy.
unhealthyPartitions.computeIfAbsent(partition, list -> new ArrayList<>())
.add(UNHEALTHY_PARTITION);
}
diff --git a/helix-rest/src/test/java/org/apache/helix/rest/clusterMaintenanceService/TestMaintenanceManagementService.java b/helix-rest/src/test/java/org/apache/helix/rest/clusterMaintenanceService/TestMaintenanceManagementService.java
index a911286..c9ed4b1 100644
--- a/helix-rest/src/test/java/org/apache/helix/rest/clusterMaintenanceService/TestMaintenanceManagementService.java
+++ b/helix-rest/src/test/java/org/apache/helix/rest/clusterMaintenanceService/TestMaintenanceManagementService.java
@@ -242,12 +242,12 @@
MaintenanceManagementService.getMapFromJsonPayload(jsonContent), Collections.singletonList("org.apache.helix.rest.server.TestOperationImpl"),
Collections.EMPTY_MAP, true);
Assert.assertFalse(instanceInfo.isSuccessful());
- Assert.assertEquals(instanceInfo.getMessages().get(0), "CUSTOM_PARTITION_HEALTH_FAILURE:UNHEALTHY_PARTITION:PARTITION_0");
+ Assert.assertEquals(instanceInfo.getMessages().get(0), "CUSTOM_PARTITION_HEALTH_FAILURE:HOST_NO_STATE_ERROR:INSTANCE0.LINKEDIN.COM_1236:PARTITION_0");
// Operation should finish even with check failed.
MockMaintenanceManagementService instanceServiceSkipFailure =
new MockMaintenanceManagementService(zkHelixDataAccessor, _configAccessor, _customRestClient, true,
- ImmutableSet.of("CUSTOM_PARTITION_HEALTH_FAILURE:UNHEALTHY_PARTITION"), HelixRestNamespace.DEFAULT_NAMESPACE_NAME);
+ ImmutableSet.of("CUSTOM_PARTITION_HEALTH_FAILURE:HOST_NO_STATE_ERROR"), HelixRestNamespace.DEFAULT_NAMESPACE_NAME);
MaintenanceManagementInstanceInfo instanceInfo2 =
instanceServiceSkipFailure.takeInstance(TEST_CLUSTER, TEST_INSTANCE, Collections.singletonList("CustomInstanceStoppableCheck"),
MaintenanceManagementService.getMapFromJsonPayload(jsonContent), Collections.singletonList("org.apache.helix.rest.server.TestOperationImpl"),