AMBARI-21593 : AMS stopped after RU [AMS distributed mode with 2 collectors] (avijayan)
diff --git a/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java b/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java
index 53e6304..addb14e 100644
--- a/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java
+++ b/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java
@@ -26,6 +26,7 @@
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.MetricsSystemInitializationException;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.TimelineMetricConfiguration;
 import org.apache.helix.HelixAdmin;
+import org.apache.helix.HelixException;
 import org.apache.helix.HelixManager;
 import org.apache.helix.HelixManagerFactory;
 import org.apache.helix.InstanceType;
@@ -123,20 +124,41 @@
     admin = new ZKHelixAdmin(zkConnectUrl);
     // create cluster
     LOG.info("Creating zookeeper cluster node: " + clusterName);
-    admin.addCluster(clusterName, false);
+    boolean clusterAdded = admin.addCluster(clusterName, false);
+    LOG.info("Was cluster added successfully? " + clusterAdded);
 
     // Adding host to the cluster
-    List<String> nodes = Collections.EMPTY_LIST;
-    try {
-      nodes =  admin.getInstancesInCluster(clusterName);
-    } catch (ZkNoNodeException ex) {
-      LOG.warn("Child znode under /" + CLUSTER_NAME + " not found.Recreating the cluster.");
-        admin.addCluster(clusterName, true);
+    boolean success = false;
+    int tries = 5;
+    int sleepTimeInSeconds = 5;
+
+    for (int i = 0; i < tries && !success; i++) {
+      try {
+        List<String> nodes = admin.getInstancesInCluster(clusterName);
+        if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) {
+          LOG.info("Adding participant instance " + instanceConfig);
+          admin.addInstance(clusterName, instanceConfig);
+        }
+        success = true; // cluster was readable: instance was already registered or has just been added
+      } catch (HelixException | ZkNoNodeException ex) {
+        LOG.warn("Helix Cluster not yet setup fully.");
+        if (i < tries - 1) {
+          LOG.info("Waiting for " + sleepTimeInSeconds + " seconds and retrying.");
+          TimeUnit.SECONDS.sleep(sleepTimeInSeconds);
+        } else {
+          LOG.error("Helix cluster " + clusterName + " not readable after " + tries + " tries.", ex);
+        }
+      }
     }
 
-    if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) {
-      LOG.info("Adding participant instance " + instanceConfig);
-      admin.addInstance(clusterName, instanceConfig);
+    if (!success) {
+      LOG.info("Trying to create " + clusterName + " again since waiting for the creation did not help.");
+      admin.addCluster(clusterName, true);
+      List<String> nodes = admin.getInstancesInCluster(clusterName);
+      if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) {
+        LOG.info("Adding participant instance " + instanceConfig);
+        admin.addInstance(clusterName, instanceConfig);
+      }
     }
 
     // Add a state model