YARN-3359. Recover collector list when RM fails over (Li Lu via Varun Saxena)
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
index cd2ec5d..30ec282 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
@@ -64,6 +64,7 @@
 import org.apache.hadoop.yarn.server.nodemanager.collectormanager.NMCollectorService;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.queuing.QueuingContainerManagerImpl;
 import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
@@ -459,8 +460,14 @@
           if (!rmWorkPreservingRestartEnabled) {
             LOG.info("Cleaning up running containers on resync");
             containerManager.cleanupContainersOnNMResync();
+            // Clear all known collectors for resync.
+            if (context.getKnownCollectors() != null) {
+              context.getKnownCollectors().clear();
+            }
           } else {
             LOG.info("Preserving containers on resync");
+            // Re-register known timeline collectors.
+            reregisterCollectors();
           }
           ((NodeStatusUpdaterImpl) nodeStatusUpdater)
             .rebootNodeStatusUpdaterAndRegisterWithRM();
@@ -472,6 +479,38 @@
     }.start();
   }
 
+  /**
+   * Reregisters all collectors known by this node to the RM. This method is
+   * called when the RM needs to resync with the node.
+   */
+  protected void reregisterCollectors() {
+    Map<ApplicationId, AppCollectorData> knownCollectors
+        = context.getKnownCollectors();
+    if (knownCollectors == null) {
+      return;
+    }
+    Map<ApplicationId, AppCollectorData> registeringCollectors
+        = context.getRegisteringCollectors();
+    for (Map.Entry<ApplicationId, AppCollectorData> entry
+        : knownCollectors.entrySet()) {
+      Application app = context.getApplications().get(entry.getKey());
+      if ((app != null)
+          && !ApplicationState.FINISHED.equals(app.getApplicationState())) {
+        registeringCollectors.putIfAbsent(entry.getKey(), entry.getValue());
+        AppCollectorData data = entry.getValue();
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(entry.getKey() + " : " + data.getCollectorAddr() + "@<"
+              + data.getRMIdentifier() + ", " + data.getVersion() + ">");
+        }
+      } else {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Remove collector data for done app " + entry.getKey());
+        }
+      }
+    }
+    knownCollectors.clear();
+  }
+
   public static class NMContext implements Context {
 
     private NodeId nodeId = null;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
index d3886b2..6558a77 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
@@ -655,17 +655,21 @@
             LOG.warn("Cannot update collector info because application ID: " +
                 appId + " is not found in RMContext!");
           } else {
-            AppCollectorData previousCollectorData = rmApp.getCollectorData();
-            if (AppCollectorData.happensBefore(previousCollectorData,
-                collectorData)) {
-              // Sending collector update event.
-              // Note: RM has to store the newly received collector data
-              // synchronously. Otherwise, the RM may send out stale collector
-              // data before this update is done, and the RM then crashes, the
-              // newly updated collector data will get lost.
-              LOG.info("Update collector information for application " + appId
-                  + " with new address: " + collectorData.getCollectorAddr());
-              ((RMAppImpl) rmApp).setCollectorData(collectorData);
+            synchronized (rmApp) {
+              AppCollectorData previousCollectorData = rmApp.getCollectorData();
+              if (AppCollectorData.happensBefore(previousCollectorData,
+                  collectorData)) {
+                // Sending collector update event.
+                // Note: RM has to store the newly received collector data
+                // synchronously. Otherwise, the RM may send out stale collector
+                // data before this update is done, and the RM then crashes, the
+                // newly updated collector data will get lost.
+                LOG.info("Update collector information for application " + appId
+                    + " with new address: " + collectorData.getCollectorAddr()
+                    + " timestamp: " + collectorData.getRMIdentifier()
+                    + ", " + collectorData.getVersion());
+                ((RMAppImpl) rmApp).setCollectorData(collectorData);
+              }
             }
           }
         }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
index 32cdb1b..842520d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
@@ -24,6 +24,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
@@ -39,6 +40,7 @@
 import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
 import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
 import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
+import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
 import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
 import org.apache.hadoop.yarn.server.api.records.NodeStatus;
@@ -60,6 +62,8 @@
   private String version;
   private Map<ContainerId, ContainerStatus> containerStats =
       new HashMap<ContainerId, ContainerStatus>();
+  private Map<ApplicationId, AppCollectorData> registeringCollectors
+      = new ConcurrentHashMap<>();
 
   public MockNM(String nodeIdStr, int memory, ResourceTrackerService resourceTracker) {
     // scale vcores based on the requested memory
@@ -117,6 +121,15 @@
         true, ++responseId);
   }
 
+  public void addRegisteringCollector(ApplicationId appId,
+      AppCollectorData data) {
+    this.registeringCollectors.put(appId, data);
+  }
+
+  public Map<ApplicationId, AppCollectorData> getRegisteringCollectors() {
+    return this.registeringCollectors;
+  }
+
   public RegisterNodeManagerResponse registerNode() throws Exception {
     return registerNode(null, null);
   }
@@ -223,6 +236,9 @@
     req.setNodeStatus(status);
     req.setLastKnownContainerTokenMasterKey(this.currentContainerTokenMasterKey);
     req.setLastKnownNMTokenMasterKey(this.currentNMTokenMasterKey);
+
+    req.setRegisteringCollectors(this.registeringCollectors);
+
     NodeHeartbeatResponse heartbeatResponse =
         resourceTracker.nodeHeartbeat(req);
     
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHATimelineCollectors.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHATimelineCollectors.java
new file mode 100644
index 0000000..a54ff34
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHATimelineCollectors.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
+import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test if the new active RM could recover collector status on a state
+ * transition.
+ */
+public class TestRMHATimelineCollectors extends RMHATestBase {
+  public static final Log LOG = LogFactory
+      .getLog(TestSubmitApplicationWithRMHA.class);
+
+  @Before
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    confForRM1.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
+    confForRM2.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
+    confForRM1.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 2.0f);
+    confForRM2.setFloat(YarnConfiguration.TIMELINE_SERVICE_VERSION, 2.0f);
+  }
+
+  @Test
+  public void testRebuildCollectorDataOnFailover() throws Exception {
+    startRMs();
+    MockNM nm1
+        = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
+    MockNM nm2
+        = new MockNM("127.0.0.1:5678", 15121, rm2.getResourceTrackerService());
+    RMApp app1 = rm1.submitApp(1024);
+    String collectorAddr1 = "1.2.3.4:5";
+    AppCollectorData data1 = AppCollectorData.newInstance(
+        app1.getApplicationId(), collectorAddr1);
+    nm1.addRegisteringCollector(app1.getApplicationId(), data1);
+
+    String collectorAddr2 = "5.4.3.2:1";
+    RMApp app2 = rm1.submitApp(1024);
+    AppCollectorData data2 = AppCollectorData.newInstance(
+        app2.getApplicationId(), collectorAddr2, rm1.getStartTime(), 1);
+    nm1.addRegisteringCollector(app2.getApplicationId(), data2);
+
+    explicitFailover();
+
+    List<ApplicationId> runningApps = new ArrayList<>();
+    runningApps.add(app1.getApplicationId());
+    runningApps.add(app2.getApplicationId());
+    nm1.registerNode(runningApps);
+    nm2.registerNode(runningApps);
+
+    String collectorAddr12 = "1.2.3.4:56";
+    AppCollectorData data12 = AppCollectorData.newInstance(
+        app1.getApplicationId(), collectorAddr12, rm1.getStartTime(), 0);
+    nm2.addRegisteringCollector(app1.getApplicationId(), data12);
+
+    String collectorAddr22 = "5.4.3.2:10";
+    AppCollectorData data22 = AppCollectorData.newInstance(
+        app2.getApplicationId(), collectorAddr22, rm1.getStartTime(), 2);
+    nm2.addRegisteringCollector(app2.getApplicationId(), data22);
+
+    Map<ApplicationId, AppCollectorData> results1
+        = nm1.nodeHeartbeat(true).getAppCollectors();
+    assertEquals(collectorAddr1,
+        results1.get(app1.getApplicationId()).getCollectorAddr());
+    assertEquals(collectorAddr2,
+        results1.get(app2.getApplicationId()).getCollectorAddr());
+
+    Map<ApplicationId, AppCollectorData> results2
+        = nm2.nodeHeartbeat(true).getAppCollectors();
+    // addr of app1 should be collectorAddr1 since it's registering (no time
+    // stamp).
+    assertEquals(collectorAddr1,
+        results2.get(app1.getApplicationId()).getCollectorAddr());
+    // addr of app2 should be collectorAddr22 since its version number is
+    // greater.
+    assertEquals(collectorAddr22,
+        results2.get(app2.getApplicationId()).getCollectorAddr());
+
+    // Now nm1 should get updated collector list
+    nm1.getRegisteringCollectors().clear();
+    Map<ApplicationId, AppCollectorData> results12
+        = nm1.nodeHeartbeat(true).getAppCollectors();
+    assertEquals(collectorAddr1,
+        results12.get(app1.getApplicationId()).getCollectorAddr());
+    assertEquals(collectorAddr22,
+        results12.get(app2.getApplicationId()).getCollectorAddr());
+
+
+  }
+}