Add new error message for customized partition check host connection error (#1984)


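Add a new error message for the customized partition check host connection error:
when no partition health state is available for a sibling instance, report
HOST_NO_STATE_ERROR together with the instance name instead of UNHEALTHY_PARTITION.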

diff --git a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
index 5939e19..d35fdb4 100644
--- a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
+++ b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
@@ -55,6 +55,7 @@
       ImmutableSet.of(HelixDefinedState.DROPPED.name(), HelixDefinedState.ERROR.name());
 
   static final String UNHEALTHY_PARTITION = "UNHEALTHY_PARTITION";
+  static final String HOST_NO_STATE_ERROR = "HOST_NO_STATE_ERROR:";
   // The message that will be shown if partition is in initial state of the state model and
   // partition health check has been skipped for that instance
   static final String PARTITION_INITIAL_STATE_FAIL = "PARTITION_INITIAL_STATE_FAIL";
@@ -257,11 +258,16 @@
               continue;
             }
 
-            // We are checking sibling partition healthy status. So if partition health does not
-            // exist or it is not healthy. We should mark this partition is unhealthy.
+            // If we failed to get partition assignment for one sibling instance, we add the
+            // instance name in return error for debuggability.
             if (!globalPartitionHealthStatus.containsKey(siblingInstance)
-                || !globalPartitionHealthStatus.get(siblingInstance).containsKey(partition)
-                || !globalPartitionHealthStatus.get(siblingInstance).get(partition)) {
+                || globalPartitionHealthStatus.get(siblingInstance).isEmpty()) {
+              unhealthyPartitions.computeIfAbsent(partition, list -> new ArrayList<>())
+                  .add(HOST_NO_STATE_ERROR + siblingInstance);
+            } else if (!globalPartitionHealthStatus.get(siblingInstance)
+                .getOrDefault(partition, false)) {
+              // The sibling returned health state, but this partition is either missing from
+              // it or reported as not healthy, so we mark this partition as unhealthy.
               unhealthyPartitions.computeIfAbsent(partition, list -> new ArrayList<>())
                   .add(UNHEALTHY_PARTITION);
             }
diff --git a/helix-rest/src/test/java/org/apache/helix/rest/clusterMaintenanceService/TestMaintenanceManagementService.java b/helix-rest/src/test/java/org/apache/helix/rest/clusterMaintenanceService/TestMaintenanceManagementService.java
index a911286..c9ed4b1 100644
--- a/helix-rest/src/test/java/org/apache/helix/rest/clusterMaintenanceService/TestMaintenanceManagementService.java
+++ b/helix-rest/src/test/java/org/apache/helix/rest/clusterMaintenanceService/TestMaintenanceManagementService.java
@@ -242,12 +242,12 @@
             MaintenanceManagementService.getMapFromJsonPayload(jsonContent), Collections.singletonList("org.apache.helix.rest.server.TestOperationImpl"),
             Collections.EMPTY_MAP, true);
     Assert.assertFalse(instanceInfo.isSuccessful());
-    Assert.assertEquals(instanceInfo.getMessages().get(0), "CUSTOM_PARTITION_HEALTH_FAILURE:UNHEALTHY_PARTITION:PARTITION_0");
+    Assert.assertEquals(instanceInfo.getMessages().get(0), "CUSTOM_PARTITION_HEALTH_FAILURE:HOST_NO_STATE_ERROR:INSTANCE0.LINKEDIN.COM_1236:PARTITION_0");
 
     // Operation should finish even with check failed.
     MockMaintenanceManagementService instanceServiceSkipFailure =
         new MockMaintenanceManagementService(zkHelixDataAccessor, _configAccessor, _customRestClient, true,
-            ImmutableSet.of("CUSTOM_PARTITION_HEALTH_FAILURE:UNHEALTHY_PARTITION"), HelixRestNamespace.DEFAULT_NAMESPACE_NAME);
+            ImmutableSet.of("CUSTOM_PARTITION_HEALTH_FAILURE:HOST_NO_STATE_ERROR"), HelixRestNamespace.DEFAULT_NAMESPACE_NAME);
     MaintenanceManagementInstanceInfo instanceInfo2 =
         instanceServiceSkipFailure.takeInstance(TEST_CLUSTER, TEST_INSTANCE, Collections.singletonList("CustomInstanceStoppableCheck"),
             MaintenanceManagementService.getMapFromJsonPayload(jsonContent), Collections.singletonList("org.apache.helix.rest.server.TestOperationImpl"),