[AMBARI-25244] : Rack based parallel restart for Rolling Upgrade (#2996)
* [AMBARI-25244] : Rack based parallel restart for Rolling Upgrade
* unit test update
* minor change
* minor changes
* adding test cases
* minor changes
* review based changes
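This change adds an optional rack-aware batching mode to rolling upgrades. When the cluster-env property `rack_yaml_file_path` points to a yaml file describing the rack-to-hosts mapping (see the new example file ambari-server/src/examples/rack_hosts.yaml for the expected format), hosts are grouped by rack before being split into restart batches, so no batch spans more than one rack. When the property is unset or empty, batching falls back to the existing SetUtils.split behavior.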
diff --git a/ambari-server/pom.xml b/ambari-server/pom.xml
index 24b95db..ab7cd11 100644
--- a/ambari-server/pom.xml
+++ b/ambari-server/pom.xml
@@ -1616,6 +1616,11 @@
<version>1.10.1</version>
</dependency>
<dependency>
+ <groupId>com.esotericsoftware.yamlbeans</groupId>
+ <artifactId>yamlbeans</artifactId>
+ <version>1.13</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.ambari</groupId>
<artifactId>ambari-metrics-common</artifactId>
<version>${ambari.metrics.version}</version>
diff --git a/ambari-server/src/examples/rack_hosts.yaml b/ambari-server/src/examples/rack_hosts.yaml
new file mode 100644
index 0000000..ef7a4a3
--- /dev/null
+++ b/ambari-server/src/examples/rack_hosts.yaml
@@ -0,0 +1,116 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+racks:
+ racka-1:
+ hostGroups:
+ - hosts:
+ - cluster12b-slave1-237.abc.xyz.com
+ - cluster12b-slave1-274.abc.xyz.com
+ - cluster12b-slave1-278.abc.xyz.com
+ - cluster12b-slave1-282.abc.xyz.com
+ - cluster12b-slave1-363.abc.xyz.com
+ - cluster12b-slave1-378.abc.xyz.com
+ - cluster12b-slave1-380.abc.xyz.com
+ - cluster12b-slave1-430.abc.xyz.com
+ - hosts:
+ - cluster12a-slave5-1.abc.xyz.com
+ - cluster12a-slave5-2.abc.xyz.com
+ - cluster12a-slave5-3.abc.xyz.com
+ - cluster12a-slave5-4.abc.xyz.com
+ - cluster12a-master5-1.abc.xyz.com
+ - cluster12a-master5-2.abc.xyz.com
+ - cluster12a-master5-3.abc.xyz.com
+ - cluster12b-slave1-141.abc.xyz.com
+ - cluster12b-slave1-163.abc.xyz.com
+ - cluster12b-slave1-176.abc.xyz.com
+ - cluster12b-slave1-5.abc.xyz.com
+ - cluster12b-slave1-72.abc.xyz.com
+ - cluster12b-master2-2.abc.xyz.com
+ rackb-22:
+ hosts:
+ - cluster12a-slave39-1.abc.xyz.com
+ - cluster12a-slave39-2.abc.xyz.com
+ - cluster12a-slave39-3.abc.xyz.com
+ - cluster12a-slave39-4.abc.xyz.com
+ - cluster12a-slave39-5.abc.xyz.com
+ - cluster12a-slave39-6.abc.xyz.com
+ - cluster12b-slave1-162.abc.xyz.com
+ - cluster12b-slave1-242.abc.xyz.com
+ - cluster12b-slave1-336.abc.xyz.com
+ - cluster12b-slave1-360.abc.xyz.com
+ - cluster12b-slave1-376.abc.xyz.com
+ - cluster12b-master1-2.abc.xyz.com
+ rackc-3:
+ hostGroups:
+ - hosts:
+ - cluster12b-slave1-339.abc.xyz.com
+ - hosts:
+ - cluster12a-slave19-1.abc.xyz.com
+ - cluster12a-slave19-2.abc.xyz.com
+ - cluster12a-slave19-3.abc.xyz.com
+ - cluster12a-slave19-4.abc.xyz.com
+ - cluster12b-slave1-120.abc.xyz.com
+ - cluster12b-slave1-165.abc.xyz.com
+ - cluster12b-slave1-232.abc.xyz.com
+ - cluster12b-slave1-281.abc.xyz.com
+ - cluster12b-slave1-29.abc.xyz.com
+ - cluster12b-slave1-314.abc.xyz.com
+ - cluster12b-slave1-328.abc.xyz.com
+ - cluster12b-slave1-334.abc.xyz.com
+ - cluster12b-slave1-36.abc.xyz.com
+ rackd-11:
+ hosts:
+ - cluster12a-slave50-1.abc.xyz.com
+ - cluster12a-slave50-2.abc.xyz.com
+ - cluster12a-slave50-3.abc.xyz.com
+ - cluster12a-slave50-4.abc.xyz.com
+ racke-122:
+ hosts:
+ - cluster12a-slave57-1.abc.xyz.com
+ - cluster12a-slave57-2.abc.xyz.com
+ - cluster12a-slave57-3.abc.xyz.com
+ - cluster12b-slave1-171.abc.xyz.com
+ - cluster12b-slave1-178.abc.xyz.com
+ - cluster12b-slave1-213.abc.xyz.com
+ - cluster12b-slave1-269.abc.xyz.com
+ - cluster12b-slave1-28.abc.xyz.com
+ - cluster12b-slave1-293.abc.xyz.com
+ - cluster12b-slave1-298.abc.xyz.com
+ - cluster12b-slave1-423.abc.xyz.com
+ - cluster12b-slave1-437.abc.xyz.com
+ - cluster12b-slave1-56.abc.xyz.com
+ racka-98:
+ hostGroups:
+ - hosts:
+ - cluster12b-slave1-356.abc.xyz.com
+ - cluster12b-slave1-459.abc.xyz.com
+ - cluster12b-slave1-460.abc.xyz.com
+ - hosts:
+ - cluster12a-slave43-1.abc.xyz.com
+ - cluster12a-slave43-2.abc.xyz.com
+ - cluster12b-slave1-1.abc.xyz.com
+ - cluster12b-slave1-11.abc.xyz.com
+ - cluster12b-slave1-160.abc.xyz.com
+ - cluster12b-slave1-173.abc.xyz.com
+ - cluster12b-slave1-229.abc.xyz.com
+ - cluster12b-slave1-249.abc.xyz.com
+ - cluster12b-slave1-38.abc.xyz.com
+ - cluster12b-slave1-59.abc.xyz.com
+ - cluster12b-slave1-62.abc.xyz.com
+ - cluster12b-slave1-76.abc.xyz.com
+ - cluster12b-slave1-78.abc.xyz.com
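The example above shows the two accepted shapes: a rack may list its hosts directly under `hosts`, or nest them under `hostGroups`. For readers unfamiliar with yamlbeans (the new dependency added to the pom above), here is a minimal standalone sketch of how such a file flattens into a host-to-rack lookup; the class name and hard-coded path are illustrative only, and the logic mirrors the `getHostToRackMap` method added to Grouping.java below.

```java
import java.io.FileReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.esotericsoftware.yamlbeans.YamlReader;

public class RackFileDemo {
  @SuppressWarnings("unchecked")
  public static void main(String[] args) throws Exception {
    // Path is an assumption; point it at any file using the schema above.
    YamlReader yaml = new YamlReader(new FileReader("ambari-server/src/examples/rack_hosts.yaml"));
    Map<String, Object> root;
    try {
      root = (Map<String, Object>) yaml.read(); // yamlbeans returns plain Maps/Lists/Strings
    } finally {
      yaml.close();
    }
    Map<String, Object> racks = (Map<String, Object>) root.get("racks");
    Map<String, String> hostToRack = new HashMap<>();
    for (Map.Entry<String, Object> rack : racks.entrySet()) {
      Map<String, Object> rackInfo = (Map<String, Object>) rack.getValue();
      // Form 1: hosts listed directly under the rack.
      List<String> hosts = (List<String>) rackInfo.get("hosts");
      if (hosts != null) {
        for (String host : hosts) {
          hostToRack.put(host, rack.getKey());
        }
      }
      // Form 2: hosts nested under hostGroups.
      List<Map<String, Object>> hostGroups = (List<Map<String, Object>>) rackInfo.get("hostGroups");
      if (hostGroups != null) {
        for (Map<String, Object> group : hostGroups) {
          List<String> groupHosts = (List<String>) group.get("hosts");
          if (groupHosts != null) {
            for (String host : groupHosts) {
              hostToRack.put(host, rack.getKey());
            }
          }
        }
      }
    }
    System.out.println(hostToRack.size() + " hosts mapped across " + racks.size() + " racks");
  }
}
```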
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/stack/upgrade/Grouping.java b/ambari-server/src/main/java/org/apache/ambari/server/stack/upgrade/Grouping.java
index dbc9f3b..b22e246 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/stack/upgrade/Grouping.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/stack/upgrade/Grouping.java
@@ -17,6 +17,8 @@
*/
package org.apache.ambari.server.stack.upgrade;
+import java.io.FileReader;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -39,9 +41,11 @@
import org.apache.ambari.server.stack.upgrade.orchestrate.TaskWrapper;
import org.apache.ambari.server.stack.upgrade.orchestrate.TaskWrapperBuilder;
import org.apache.ambari.server.stack.upgrade.orchestrate.UpgradeContext;
+import org.apache.ambari.server.state.ConfigHelper;
import org.apache.ambari.server.utils.SetUtils;
import org.apache.commons.lang.StringUtils;
+import com.esotericsoftware.yamlbeans.YamlReader;
import com.google.common.base.MoreObjects;
/**
@@ -52,6 +56,10 @@
StartGrouping.class, StopGrouping.class, HostOrderGrouping.class, ParallelClientGrouping.class })
public class Grouping {
+ private static final String RACKS_YAML_KEY_NAME = "racks";
+ private static final String HOSTS_YAML_KEY_NAME = "hosts";
+ private static final String HOST_GROUPS_YAML_KEY_NAME = "hostGroups";
+
@XmlAttribute(name="name")
public String name;
@@ -217,7 +225,8 @@
* @param pc Processing Component
* @param params Params to add to the stage.
*/
- private void addTasksToStageInBatches(List<TaskWrapper> tasks, String verb, UpgradeContext ctx, String service, ProcessingComponent pc, Map<String, String> params) {
+ private void addTasksToStageInBatches(List<TaskWrapper> tasks, String verb, UpgradeContext ctx, String service,
+ ProcessingComponent pc, Map<String, String> params) {
if (tasks == null || tasks.isEmpty() || tasks.get(0).getTasks() == null || tasks.get(0).getTasks().isEmpty()) {
return;
}
@@ -230,8 +239,25 @@
List<Set<String>> hostSets = null;
int parallel = getParallelHostCount(ctx, 1);
- hostSets = SetUtils.split(tw.getHosts(), parallel);
-
+ final String rackYamlFile =
+ ctx.getResolver().getValueFromDesiredConfigurations(ConfigHelper.CLUSTER_ENV, "rack_yaml_file_path");
+ if (StringUtils.isNotEmpty(rackYamlFile)) {
+ // If the rack-to-hosts mapping yaml file path is present in the cluster-env property rack_yaml_file_path,
+ // host sets are formed rack by rack: based on the parallel value, hosts on the same rack are
+ // batched together. This limits the impact of a single rack failure during the upgrade.
+ Map<String, Set<String>> hostsByRack = organizeHostsByRack(tw.getHosts(), rackYamlFile);
+ List<Set<String>> hostSetsForRack;
+ for (String rack : hostsByRack.keySet()) {
+ hostSetsForRack = SetUtils.split(hostsByRack.get(rack), parallel);
+ if (hostSets == null) {
+ hostSets = hostSetsForRack;
+ } else {
+ hostSets.addAll(hostSetsForRack);
+ }
+ }
+ } else {
+ hostSets = SetUtils.split(tw.getHosts(), parallel);
+ }
int numBatchesNeeded = hostSets.size();
int batchNum = 0;
for (Set<String> hostSubset : hostSets) {
@@ -250,6 +276,71 @@
}
/**
+ * Utility method that organizes the given hosts by rack, using the given rack yaml file
+ *
+ * @param hosts : all hosts that are part of the current group
+ * @param rackYamlFile : file path of a yaml containing the rack-to-hosts mapping,
+ * e.g. ambari-server/src/examples/rack_hosts.yaml
+ * @return map from rack name to the subset of the given hosts residing on that rack
+ */
+ private Map<String, Set<String>> organizeHostsByRack(Set<String> hosts, String rackYamlFile) {
+ try {
+ Map<String, String> hostToRackMap = getHostToRackMap(rackYamlFile);
+ Map<String, Set<String>> rackToHostsMap = new HashMap<>();
+ for (String host : hosts) {
+ if (hostToRackMap.containsKey(host)) {
+ String rack = hostToRackMap.get(host);
+ if (!rackToHostsMap.containsKey(rack)) {
+ rackToHostsMap.put(rack, new HashSet<>());
+ }
+ rackToHostsMap.get(rack).add(host);
+ } else {
+ throw new RuntimeException(String.format("Rack mapping is not present for host name: %s", host));
+ }
+ }
+ return rackToHostsMap;
+ } catch (Exception e) {
+ throw new RuntimeException(
+ String.format("Failed to generate Rack to Hosts mapping. filePath: %s", rackYamlFile), e);
+ }
+ }
+
+ private static Map<String, String> getHostToRackMap(String rackYamlFile)
+ throws IOException {
+ YamlReader yamlReader = new YamlReader(new FileReader(rackYamlFile));
+ Map rackHostsMap;
+ try {
+ rackHostsMap = (Map) yamlReader.read();
+ } finally {
+ yamlReader.close();
+ }
+ Map racks = (Map) rackHostsMap.get(RACKS_YAML_KEY_NAME);
+ Map<String, String> hostToRackMap = new HashMap<>();
+ for (Map.Entry entry : (Set<Map.Entry>) racks.entrySet()) {
+ Map rackInfoMap = (Map) entry.getValue();
+ String rackName = (String) entry.getKey();
+ if (rackInfoMap.containsKey(HOSTS_YAML_KEY_NAME)) {
+ List<String> hostList = (List<String>) rackInfoMap.get(HOSTS_YAML_KEY_NAME);
+ for (String host : hostList) {
+ hostToRackMap.put(host, rackName);
+ }
+ }
+ if (rackInfoMap.containsKey(HOST_GROUPS_YAML_KEY_NAME)) {
+ List<Map> hostGroups = (List<Map>) rackInfoMap.get(HOST_GROUPS_YAML_KEY_NAME);
+ for (Map hostGroup : hostGroups) {
+ if (hostGroup.containsKey(HOSTS_YAML_KEY_NAME)) {
+ List<String> hostList = (List<String>) hostGroup.get(HOSTS_YAML_KEY_NAME);
+ for (String host : hostList) {
+ hostToRackMap.put(host, rackName);
+ }
+ }
+ }
+ }
+ }
+ return hostToRackMap;
+ }
+
+ /**
* Determine if service checks need to be ran after the stages.
* @param upgradeContext the upgrade context
* @return Return the stages, which may potentially be followed by service checks.
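To make the batching rule concrete, here is a rough self-contained sketch of the per-rack splitting done in `addTasksToStageInBatches`; the `chunk` helper is a hypothetical stand-in for Ambari's `SetUtils.split`, whose exact distribution of hosts across subsets may differ.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class RackBatchDemo {
  // Hypothetical stand-in for SetUtils.split: chunks one rack's hosts into
  // batches of at most `parallel` hosts (the real distribution may differ).
  static List<Set<String>> chunk(Set<String> hosts, int parallel) {
    List<Set<String>> batches = new ArrayList<>();
    Set<String> current = new LinkedHashSet<>();
    for (String host : hosts) {
      current.add(host);
      if (current.size() == parallel) {
        batches.add(current);
        current = new LinkedHashSet<>();
      }
    }
    if (!current.isEmpty()) {
      batches.add(current);
    }
    return batches;
  }

  public static void main(String[] args) {
    Map<String, Set<String>> hostsByRack = new LinkedHashMap<>();
    hostsByRack.put("racka-1", new LinkedHashSet<>(Arrays.asList("h1", "h2", "h3")));
    hostsByRack.put("rackb-22", new LinkedHashSet<>(Arrays.asList("h4", "h5")));

    // As in addTasksToStageInBatches: split rack by rack, then concatenate,
    // so no batch ever mixes hosts from two racks.
    List<Set<String>> hostSets = new ArrayList<>();
    for (Set<String> rackHosts : hostsByRack.values()) {
      hostSets.addAll(chunk(rackHosts, 2));
    }
    System.out.println(hostSets); // [[h1, h2], [h3], [h4, h5]] -- h3 stays in its own rack's batch
  }
}
```

The point is the invariant rather than the chunk sizes: because splitting happens per rack before the results are concatenated, a failure confined to one rack can affect at most the batches belonging to that rack.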
diff --git a/ambari-server/src/test/java/org/apache/ambari/server/stack/upgrade/orchestrate/UpgradeHelperTest.java b/ambari-server/src/test/java/org/apache/ambari/server/stack/upgrade/orchestrate/UpgradeHelperTest.java
index ebe69c7..124a6e0 100644
--- a/ambari-server/src/test/java/org/apache/ambari/server/stack/upgrade/orchestrate/UpgradeHelperTest.java
+++ b/ambari-server/src/test/java/org/apache/ambari/server/stack/upgrade/orchestrate/UpgradeHelperTest.java
@@ -33,7 +33,9 @@
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
+import java.io.File;
import java.lang.reflect.Field;
+import java.nio.charset.Charset;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.Collection;
@@ -114,17 +116,21 @@
import org.apache.ambari.server.state.StackId;
import org.apache.ambari.server.state.UpgradeState;
import org.apache.ambari.server.utils.EventBusSynchronizer;
+import org.apache.ambari.server.utils.StageUtils;
import org.apache.ambari.spi.ClusterInformation;
import org.apache.ambari.spi.RepositoryType;
import org.apache.ambari.spi.upgrade.OrchestrationOptions;
import org.apache.ambari.spi.upgrade.UpgradeType;
+import org.apache.commons.io.FileUtils;
import org.easymock.Capture;
import org.easymock.EasyMock;
import org.easymock.EasyMockSupport;
import org.easymock.IAnswer;
import org.junit.After;
import org.junit.Before;
+import org.junit.Rule;
import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
import org.springframework.security.core.context.SecurityContextHolder;
import com.google.common.collect.ImmutableMap;
@@ -165,6 +171,9 @@
private RepositoryVersionEntity repositoryVersion2210;
private HostsType namenodeHosts = HostsType.highAvailability("h1", "h2", newLinkedHashSet(Arrays.asList("h1", "h2")));
+ @Rule
+ public TemporaryFolder tmpFolder = new TemporaryFolder();
+
/**
* Because test cases need to share config mocks, put common ones in this function.
* @throws Exception
@@ -536,8 +545,7 @@
UpgradeType.ROLLING, repositoryVersion2210);
// use a "real" master host resolver here so that we can actually test MM
- MasterHostResolver masterHostResolver = new MasterHostResolver(cluster, null, context);
-
+ MasterHostResolver masterHostResolver = new MasterHostResolver(cluster, m_configHelper, context);
expect(context.getResolver()).andReturn(masterHostResolver).anyTimes();
replay(context);
@@ -1212,7 +1220,7 @@
// HBASE and PIG have service checks, but not TEZ.
Set<String> additionalServices = new HashSet<String>() {{ add("HBASE"); add("PIG"); add("TEZ"); add("AMBARI_METRICS"); }};
- Cluster c = makeCluster(true, additionalServices);
+ Cluster c = makeCluster(true, additionalServices, "");
int numServiceChecksExpected = 0;
Collection<Service> services = c.getServices().values();
@@ -1399,14 +1407,15 @@
* @throws AmbariException
*/
private Cluster makeCluster(boolean clean) throws AmbariException, AuthorizationException {
- return makeCluster(clean, new HashSet<>());
+ return makeCluster(clean, new HashSet<>(), "");
}
/**
* Create an HA cluster
* @throws AmbariException
*/
- private Cluster makeCluster(boolean clean, Set<String> additionalServices) throws AmbariException, AuthorizationException {
+ private Cluster makeCluster(boolean clean, Set<String> additionalServices, String yamlFileName)
+ throws AmbariException, AuthorizationException {
Clusters clusters = injector.getInstance(Clusters.class);
ServiceFactory serviceFactory = injector.getInstance(ServiceFactory.class);
@@ -1536,7 +1545,9 @@
expect(m_masterHostResolver.getMasterAndHosts("OOZIE", "OOZIE_CLIENT")).andReturn(type).anyTimes();
expect(m_masterHostResolver.getCluster()).andReturn(c).anyTimes();
-
+ expect(m_masterHostResolver
+ .getValueFromDesiredConfigurations("cluster-env", "rack_yaml_file_path"))
+ .andReturn(yamlFileName).anyTimes();
for(String service : additionalServices) {
c.addService(service, repositoryVersion);
if (service.equals("HBASE")) {
@@ -2101,7 +2112,8 @@
type = HostsType.normal("h1", "h2");
expect(m_masterHostResolver.getMasterAndHosts("ZOOKEEPER", "ZOOKEEPER_CLIENT")).andReturn(type).anyTimes();
-
+ expect(m_masterHostResolver.getValueFromDesiredConfigurations("cluster-env", "rack_yaml_file_path")).andReturn("")
+ .anyTimes();
replay(m_masterHostResolver);
Map<String, UpgradePack> upgrades = ambariMetaInfo.getUpgradePacks("HDP", "2.1.1");
@@ -2431,6 +2443,113 @@
EasyMock.verify(kerberosDetails);
}
+ @Test
+ public void testUpgradeOrchestrationWithRack() throws Exception {
+ Map<String, UpgradePack> upgrades = ambariMetaInfo.getUpgradePacks("foo", "bar");
+ assertTrue(upgrades.isEmpty());
+ upgrades = ambariMetaInfo.getUpgradePacks("HDP", "2.1.1");
+ ServiceInfo serviceInfo = ambariMetaInfo.getService("HDP", "2.2.0", "ZOOKEEPER");
+ serviceInfo.setDisplayName("Zk");
+ ComponentInfo componentInfo = serviceInfo.getComponentByName("ZOOKEEPER_SERVER");
+ componentInfo.setDisplayName("ZooKeeper1 Server2");
+ assertTrue(upgrades.containsKey("upgrade_test"));
+ UpgradePack upgrade = upgrades.get("upgrade_test");
+ assertNotNull(upgrade);
+ File file = tmpFolder.newFile("rack_config.yaml");
+ FileUtils.writeStringToFile(file, "racks:\n" +
+ " racka-1:\n" +
+ " hostGroups:\n" +
+ " - hosts:\n" +
+ " - h2\n" +
+ " - h4\n" +
+ " - hosts:\n" +
+ " - h3\n" +
+ " - h5\n" +
+ " rackb-22:\n" +
+ " hosts:\n" +
+ " - h1\n" +
+ " - " + StageUtils.getHostName() + "\n", Charset.defaultCharset(), false);
+ Cluster cluster = makeCluster(true, new HashSet<>(), file.getAbsolutePath());
+ UpgradeContext context = getMockUpgradeContext(cluster, Direction.UPGRADE, UpgradeType.ROLLING);
+ List<UpgradeGroupHolder> groups = m_upgradeHelper.createSequence(upgrade, context);
+ assertEquals(7, groups.size());
+ assertEquals("PRE_CLUSTER", groups.get(0).name);
+ assertEquals("ZOOKEEPER", groups.get(1).name);
+ assertEquals("CORE_MASTER", groups.get(2).name);
+ assertEquals("CORE_SLAVES", groups.get(3).name);
+ assertEquals("HIVE", groups.get(4).name);
+ assertEquals("OOZIE", groups.get(5).name);
+ UpgradeGroupHolder holder = groups.get(2);
+ boolean found = false;
+ for (StageWrapper sw : holder.items) {
+ if (sw.getTasksJson().contains("Upgrading your database")) {
+ found = true;
+ }
+ }
+ assertTrue("Expected to find replaced text for Upgrading", found);
+
+ UpgradeGroupHolder group = groups.get(1);
+ assertTrue(group.items.get(1).getText().contains("ZooKeeper1 Server2"));
+ assertEquals("Service Check Zk", group.items.get(5).getText());
+ group = groups.get(3);
+ assertEquals(8, group.items.size());
+ StageWrapper sw = group.items.get(3);
+ assertEquals("Validate Partial Upgrade", sw.getText());
+ assertEquals(1, sw.getTasks().size());
+ assertEquals(1, sw.getTasks().get(0).getTasks().size());
+ Task task = sw.getTasks().get(0).getTasks().get(0);
+ assertEquals(ManualTask.class, task.getClass());
+ ManualTask mt = (ManualTask) task;
+ assertTrue(mt.messages.get(0).contains("DataNode and NodeManager"));
+ assertNotNull(mt.structuredOut);
+ assertTrue(mt.structuredOut.contains("DATANODE"));
+ assertTrue(mt.structuredOut.contains("NODEMANAGER"));
+ UpgradeGroupHolder postGroup = groups.get(6);
+ assertEquals("POST_CLUSTER", postGroup.name);
+ assertEquals("Finalize Upgrade", postGroup.title);
+ assertEquals(3, postGroup.items.size());
+ assertEquals("Confirm Finalize", postGroup.items.get(0).getText());
+ assertEquals("Execute HDFS Finalize", postGroup.items.get(1).getText());
+ assertEquals("Save Cluster State", postGroup.items.get(2).getText());
+ assertEquals(StageWrapper.Type.SERVER_SIDE_ACTION, postGroup.items.get(2).getType());
+ assertEquals(4, groups.get(0).items.size());
+ assertEquals(6, groups.get(1).items.size());
+ assertEquals(9, groups.get(2).items.size());
+ assertEquals(8, groups.get(3).items.size());
+ stackManagerMock.invalidateCurrentPaths();
+ ambariMetaInfo.init();
+ }
+
+ @Test
+ public void testUpgradeOrchestrationWithRackError() throws Exception {
+ Map<String, UpgradePack> upgrades = ambariMetaInfo.getUpgradePacks("foo", "bar");
+ assertTrue(upgrades.isEmpty());
+ upgrades = ambariMetaInfo.getUpgradePacks("HDP", "2.1.1");
+ ServiceInfo serviceInfo = ambariMetaInfo.getService("HDP", "2.2.0", "ZOOKEEPER");
+ serviceInfo.setDisplayName("Zk");
+ ComponentInfo componentInfo = serviceInfo.getComponentByName("ZOOKEEPER_SERVER");
+ componentInfo.setDisplayName("ZooKeeper1 Server2");
+ assertTrue(upgrades.containsKey("upgrade_test"));
+ UpgradePack upgrade = upgrades.get("upgrade_test");
+ assertNotNull(upgrade);
+ File file = tmpFolder.newFile("rack_config.yaml");
+ FileUtils.writeStringToFile(file, "racks:\n" +
+ " racka-1:\n" +
+ " hostGroups:\n" +
+ " - hosts:\n" +
+ " - h4\n" +
+ " - hosts:\n" +
+ " - h5\n", Charset.defaultCharset(), false);
+ Cluster cluster = makeCluster(true, new HashSet<>(), file.getAbsolutePath());
+ UpgradeContext context = getMockUpgradeContext(cluster, Direction.UPGRADE, UpgradeType.ROLLING);
+ try {
+ m_upgradeHelper.createSequence(upgrade, context);
+ fail("Expected a RuntimeException for hosts missing from the rack mapping");
+ } catch (RuntimeException e) {
+ assertTrue(e.getCause().getMessage().contains("Rack mapping is not present for host name"));
+ }
+ }
+
/**
* Tests merging configurations between existing and new stack values on
* upgrade.