AMBARI-21593 : AMS stopped after RU [AMS distributed mode with 2 collectors] (avijayan)
diff --git a/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java b/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java
index 53e6304..addb14e 100644
--- a/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java
+++ b/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/availability/MetricCollectorHAController.java
@@ -26,6 +26,7 @@
import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.MetricsSystemInitializationException;
import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.TimelineMetricConfiguration;
import org.apache.helix.HelixAdmin;
+import org.apache.helix.HelixException;
import org.apache.helix.HelixManager;
import org.apache.helix.HelixManagerFactory;
import org.apache.helix.InstanceType;
@@ -123,20 +124,41 @@
admin = new ZKHelixAdmin(zkConnectUrl);
// create cluster
LOG.info("Creating zookeeper cluster node: " + clusterName);
- admin.addCluster(clusterName, false);
+ boolean clusterAdded = admin.addCluster(clusterName, false);
+ LOG.info("Was cluster added successfully? " + clusterAdded);
// Adding host to the cluster
- List<String> nodes = Collections.EMPTY_LIST;
- try {
- nodes = admin.getInstancesInCluster(clusterName);
- } catch (ZkNoNodeException ex) {
- LOG.warn("Child znode under /" + CLUSTER_NAME + " not found.Recreating the cluster.");
- admin.addCluster(clusterName, true);
+ boolean success = false;
+ int tries = 5;
+ int sleepTimeInSeconds = 5;
+
+      for (int i = 0; i < tries && !success; i++) { // bounded retry while the Helix cluster znodes are still being created
+        try {
+          List<String> nodes = admin.getInstancesInCluster(clusterName);
+          if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) {
+            LOG.info("Adding participant instance " + instanceConfig);
+            admin.addInstance(clusterName, instanceConfig);
+          }
+          success = true; // also reached when the instance is already registered, so the recreate-cluster fallback is skipped
+        } catch (HelixException | ZkNoNodeException ex) {
+          LOG.warn("Helix Cluster not yet setup fully.");
+          if (i < tries - 1) {
+            LOG.info("Waiting for " + sleepTimeInSeconds + " seconds and retrying.");
+            TimeUnit.SECONDS.sleep(sleepTimeInSeconds);
+          } else {
+            LOG.error("Helix cluster was not ready after " + tries + " attempts.", ex); // keep the stack trace on the final failure
+          }
+        }
+      }
- if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) {
- LOG.info("Adding participant instance " + instanceConfig);
- admin.addInstance(clusterName, instanceConfig);
+ if (!success) {
+ LOG.info("Trying to create " + clusterName + " again since waiting for the creation did not help.");
+ admin.addCluster(clusterName, true);
+ List<String> nodes = admin.getInstancesInCluster(clusterName);
+ if (CollectionUtils.isEmpty(nodes) || !nodes.contains(instanceConfig.getInstanceName())) {
+ LOG.info("Adding participant instance " + instanceConfig);
+ admin.addInstance(clusterName, instanceConfig);
+ }
}
// Add a state model