Fix management mode history duplicate recording (#1846)

The management mode history has duplicate entries. It does not impact the normal function, but it's good to get it fixed to avoid confusion. This commit fixes the issue by adding a check for the status in metadata store and the calculated status.
diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/ManagementModeStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/ManagementModeStage.java
index 3329d8c..4aa09d0 100644
--- a/helix-core/src/main/java/org/apache/helix/controller/stages/ManagementModeStage.java
+++ b/helix-core/src/main/java/org/apache/helix/controller/stages/ManagementModeStage.java
@@ -86,9 +86,7 @@
         checkClusterFreezeStatus(cache.getEnabledLiveInstances(), cache.getLiveInstances(),
             cache.getAllInstancesMessages(), cache.getPauseSignal());
 
-    recordClusterStatus(managementMode, accessor);
-    recordManagementModeHistory(managementMode, cache.getPauseSignal(), manager.getInstanceName(),
-        accessor);
+    recordClusterStatus(managementMode, cache.getPauseSignal(), manager.getInstanceName(), accessor);
 
     event.addAttribute(AttributeName.CLUSTER_STATUS.name(), managementMode);
   }
@@ -135,36 +133,31 @@
         .anyMatch(message -> PENDING_MESSAGE_TYPES.contains(message.getMsgType()));
   }
 
-  private void recordClusterStatus(ClusterManagementMode mode, HelixDataAccessor accessor) {
-    // update cluster status
+  private void recordClusterStatus(ClusterManagementMode mode, PauseSignal pauseSignal,
+      String controllerName, HelixDataAccessor accessor) {
+    // Read cluster status from metadata store
     PropertyKey statusPropertyKey = accessor.keyBuilder().clusterStatus();
     ClusterStatus clusterStatus = accessor.getProperty(statusPropertyKey);
     if (clusterStatus == null) {
       clusterStatus = new ClusterStatus();
     }
 
-    ClusterManagementMode.Type recordedType = clusterStatus.getManagementMode();
-    ClusterManagementMode.Status recordedStatus = clusterStatus.getManagementModeStatus();
-
-    // If there is any pending message sent by users, status could be computed as in progress.
-    // Skip recording status change to avoid confusion after cluster is already fully frozen.
-    if (ClusterManagementMode.Type.CLUSTER_FREEZE.equals(recordedType)
-        && ClusterManagementMode.Status.COMPLETED.equals(recordedStatus)
-        && ClusterManagementMode.Type.CLUSTER_FREEZE.equals(mode.getMode())
-        && ClusterManagementMode.Status.IN_PROGRESS.equals(mode.getStatus())) {
-      LOG.info("Skip recording status mode={}, status={}, because cluster is fully frozen",
-          mode.getMode(), mode.getStatus());
+    if (mode.getMode().equals(clusterStatus.getManagementMode())
+        && mode.getStatus().equals(clusterStatus.getManagementModeStatus())) {
+      // No need to update status and avoid duplicates when it's the same as metadata store
+      LOG.debug("Skip recording duplicate status mode={}, status={}", mode.getMode(),
+          mode.getStatus());
       return;
     }
 
-    if (!mode.getMode().equals(recordedType) || !mode.getStatus().equals(recordedStatus)) {
-      // Only update status when it's different with metadata store
-      clusterStatus.setManagementMode(mode.getMode());
-      clusterStatus.setManagementModeStatus(mode.getStatus());
-      if (!accessor.updateProperty(statusPropertyKey, clusterStatus)) {
-        LOG.error("Failed to update cluster status {}", clusterStatus);
-      }
+    // Update cluster status in metadata store
+    clusterStatus.setManagementMode(mode.getMode());
+    clusterStatus.setManagementModeStatus(mode.getStatus());
+    if (!accessor.updateProperty(statusPropertyKey, clusterStatus)) {
+      LOG.error("Failed to update cluster status {}", clusterStatus);
     }
+
+    recordManagementModeHistory(mode, pauseSignal, controllerName, accessor);
   }
 
   private void recordManagementModeHistory(ClusterManagementMode mode, PauseSignal pauseSignal,
diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java
index 11a0c60..1c61624 100644
--- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java
+++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java
@@ -35,6 +35,7 @@
 import org.apache.helix.manager.zk.ZKHelixDataAccessor;
 import org.apache.helix.manager.zk.ZkBaseDataAccessor;
 import org.apache.helix.model.ClusterStatus;
+import org.apache.helix.model.ControllerHistory;
 import org.apache.helix.model.LiveInstance;
 import org.testng.Assert;
 import org.testng.annotations.AfterClass;
@@ -93,15 +94,40 @@
     ClusterStatus clusterStatus = _accessor.getProperty(_accessor.keyBuilder().clusterStatus());
     Assert.assertEquals(clusterStatus.getManagementMode(), ClusterManagementMode.Type.CLUSTER_FREEZE);
 
+    ControllerHistory history =
+        _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory());
+    Assert.assertNull(history);
 
-    // Mark a live instance to be pause state
-    LiveInstance liveInstance = liveInstances.get(0);
-    liveInstance.setStatus(LiveInstance.LiveInstanceStatus.FROZEN);
-    PropertyKey liveInstanceKey =
-        _accessor.keyBuilder().liveInstance(liveInstance.getInstanceName());
-    _accessor.updateProperty(liveInstanceKey, liveInstance);
+    // Mark both live instances to be frozen, then entering freeze mode is complete
+    for (int i = 0; i < 2; i++) {
+      LiveInstance liveInstance = liveInstances.get(i);
+      liveInstance.setStatus(LiveInstance.LiveInstanceStatus.FROZEN);
+      PropertyKey liveInstanceKey =
+          _accessor.keyBuilder().liveInstance(liveInstance.getInstanceName());
+      _accessor.updateProperty(liveInstanceKey, liveInstance);
+    }
     // Require cache refresh
     cache.notifyDataChange(HelixConstants.ChangeType.LIVE_INSTANCE);
+    runPipeline(event, dataRefresh, false);
+    managementModeStage.process(event);
+
+    // Freeze mode is complete
+    clusterStatus = _accessor.getProperty(_accessor.keyBuilder().clusterStatus());
+    Assert.assertEquals(clusterStatus.getManagementMode(), ClusterManagementMode.Type.CLUSTER_FREEZE);
+    Assert.assertEquals(clusterStatus.getManagementModeStatus(),
+        ClusterManagementMode.Status.COMPLETED);
+
+    // Management history is recorded
+    history = _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory());
+    Assert.assertEquals(history.getManagementModeHistory().size(), 1);
+    String lastHistory = history.getManagementModeHistory().get(0);
+    Assert.assertTrue(lastHistory.contains("MODE=" + ClusterManagementMode.Type.CLUSTER_FREEZE));
+    Assert.assertTrue(lastHistory.contains("STATUS=" + ClusterManagementMode.Status.COMPLETED));
+
+    // No duplicate management mode history entries
+    managementModeStage.process(event);
+    history = _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory());
+    Assert.assertEquals(history.getManagementModeHistory().size(), 1);
 
     // Unfreeze cluster
     request = ClusterManagementModeRequest.newBuilder()
@@ -119,11 +145,15 @@
     Assert.assertEquals(clusterStatus.getManagementModeStatus(),
         ClusterManagementMode.Status.IN_PROGRESS);
 
-    // remove froze status to mark the live instance to be normal status
-    liveInstance = _accessor.getProperty(liveInstanceKey);
-    liveInstance.getRecord().getSimpleFields()
-        .remove(LiveInstance.LiveInstanceProperty.STATUS.name());
-    _accessor.setProperty(liveInstanceKey, liveInstance);
+    // remove froze status to mark the live instances to be normal status
+    for (int i = 0; i < 2; i++) {
+      LiveInstance liveInstance = liveInstances.get(i);
+      PropertyKey liveInstanceKey =
+          _accessor.keyBuilder().liveInstance(liveInstance.getInstanceName());
+      liveInstance.getRecord().getSimpleFields()
+          .remove(LiveInstance.LiveInstanceProperty.STATUS.name());
+      _accessor.setProperty(liveInstanceKey, liveInstance);
+    }
     // Require cache refresh
     cache.notifyDataChange(HelixConstants.ChangeType.LIVE_INSTANCE);
     runPipeline(event, dataRefresh, false);
diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterFreezeMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterFreezeMode.java
index e068f38..6910263 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterFreezeMode.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterFreezeMode.java
@@ -172,6 +172,12 @@
     expectedClusterStatus.setManagementModeStatus(ClusterManagementMode.Status.IN_PROGRESS);
     verifyClusterStatus(expectedClusterStatus);
 
+    // Verify management mode history is empty
+    ControllerHistory controllerHistory =
+        _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory());
+    List<String> managementHistory = controllerHistory.getManagementModeHistory();
+    Assert.assertTrue(managementHistory.isEmpty());
+
     // Unblock to finish state transition and delete the ST message
     latch.countDown();
 
@@ -185,12 +191,17 @@
 
     // Verify management mode history
     Assert.assertTrue(TestHelper.verify(() -> {
-      ControllerHistory history = _accessor.getProperty(keyBuilder.controllerLeaderHistory());
-      List<String> managementHistory = history.getManagementModeHistory();
-      if (managementHistory == null || managementHistory.isEmpty()) {
+      ControllerHistory tmpControllerHistory =
+          _accessor.getProperty(keyBuilder.controllerLeaderHistory());
+      List<String> tmpManagementHistory = tmpControllerHistory.getManagementModeHistory();
+      if (tmpManagementHistory == null || tmpManagementHistory.isEmpty()) {
         return false;
       }
-      String lastHistory = managementHistory.get(managementHistory.size() - 1);
+      // Should not have duplicate entries
+      if (tmpManagementHistory.size() > 1) {
+        return false;
+      }
+      String lastHistory = tmpManagementHistory.get(0);
       return lastHistory.contains("MODE=" + ClusterManagementMode.Type.CLUSTER_FREEZE)
           && lastHistory.contains("STATUS=" + ClusterManagementMode.Status.COMPLETED)
           && lastHistory.contains("REASON=" + methodName);