| /** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hadoop.hbase.favored; |
| |
| import static org.apache.hadoop.hbase.ServerName.NON_STARTCODE; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Random; |
| import java.util.Set; |
| |
| import org.apache.commons.lang3.StringUtils; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.hbase.Cell.Type; |
| import org.apache.hadoop.hbase.CellBuilderFactory; |
| import org.apache.hadoop.hbase.CellBuilderType; |
| import org.apache.hadoop.hbase.HBaseIOException; |
| import org.apache.hadoop.hbase.HConstants; |
| import org.apache.hadoop.hbase.MetaTableAccessor; |
| import org.apache.hadoop.hbase.ServerName; |
| import org.apache.hadoop.hbase.TableName; |
| import org.apache.hadoop.hbase.client.Connection; |
| import org.apache.hadoop.hbase.client.ConnectionFactory; |
| import org.apache.hadoop.hbase.client.Put; |
| import org.apache.hadoop.hbase.client.RegionInfo; |
| import org.apache.hadoop.hbase.client.Table; |
| import org.apache.hadoop.hbase.master.RackManager; |
| import org.apache.hadoop.hbase.util.Bytes; |
| import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; |
| import org.apache.yetus.audience.InterfaceAudience; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.hbase.thirdparty.com.google.common.collect.Lists; |
| import org.apache.hbase.thirdparty.com.google.common.collect.Sets; |
| import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.FavoredNodes; |
| |
| /** |
| * Helper class for {@link FavoredNodeLoadBalancer} that has all the intelligence for racks, |
| * meta scans, etc. Instantiated by the {@link FavoredNodeLoadBalancer} when needed (from |
| * within calls like {@link FavoredNodeLoadBalancer#randomAssignment(RegionInfo, List)}). |
| * All updates to favored nodes should only be done from {@link FavoredNodesManager} and not |
| * through this helper class (except for tests). |
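 * <p>
 * A minimal usage sketch (illustrative only; assumes a live {@code Configuration} and the
 * current list of online region servers):
 * <pre>{@code
 * FavoredNodeAssignmentHelper helper = new FavoredNodeAssignmentHelper(servers, conf);
 * helper.initialize(); // must be called before any placement method
 * Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>();
 * helper.placePrimaryRSAsRoundRobin(assignmentMap, primaryRSMap, regions);
 * Map<RegionInfo, ServerName[]> secondaryAndTertiary =
 *     helper.placeSecondaryAndTertiaryRS(primaryRSMap);
 * }</pre>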
| */ |
| @InterfaceAudience.Private |
| public class FavoredNodeAssignmentHelper { |
| private static final Logger LOG = LoggerFactory.getLogger(FavoredNodeAssignmentHelper.class); |
| private RackManager rackManager; |
| private Map<String, List<ServerName>> rackToRegionServerMap; |
| private List<String> uniqueRackList; |
  // This map serves as a cache for server-to-rack lookups. The number of
  // region server entries might not match what is in servers.
| private Map<String, String> regionServerToRackMap; |
| private Random random; |
| private List<ServerName> servers; |
| public static final byte [] FAVOREDNODES_QUALIFIER = Bytes.toBytes("fn"); |
| public final static short FAVORED_NODES_NUM = 3; |
| public final static short MAX_ATTEMPTS_FN_GENERATION = 10; |
| |
| public FavoredNodeAssignmentHelper(final List<ServerName> servers, Configuration conf) { |
| this(servers, new RackManager(conf)); |
| } |
| |
| public FavoredNodeAssignmentHelper(final List<ServerName> servers, |
| final RackManager rackManager) { |
| this.servers = servers; |
| this.rackManager = rackManager; |
| this.rackToRegionServerMap = new HashMap<>(); |
| this.regionServerToRackMap = new HashMap<>(); |
| this.uniqueRackList = new ArrayList<>(); |
| this.random = new Random(); |
| } |
| |
  // Always call initialize() after a FavoredNodeAssignmentHelper is constructed.
| public void initialize() { |
| for (ServerName sn : this.servers) { |
| String rackName = getRackOfServer(sn); |
| List<ServerName> serverList = this.rackToRegionServerMap.get(rackName); |
| if (serverList == null) { |
| serverList = Lists.newArrayList(); |
| // Add the current rack to the unique rack list |
| this.uniqueRackList.add(rackName); |
| this.rackToRegionServerMap.put(rackName, serverList); |
| } |
      boolean alreadyPresent = false;
      for (ServerName serverName : serverList) {
        if (ServerName.isSameAddress(sn, serverName)) {
          // The server is already present at this address; don't add it again.
          alreadyPresent = true;
          break;
        }
      }
      if (!alreadyPresent) {
        serverList.add(sn);
      }
      this.regionServerToRackMap.put(sn.getHostname(), rackName);
| } |
| } |
| |
| /** |
| * Update meta table with favored nodes info |
| * @param regionToFavoredNodes map of RegionInfo's to their favored nodes |
| * @param connection connection to be used |
   * @throws IOException if a remote or network exception occurs while updating meta
| */ |
| public static void updateMetaWithFavoredNodesInfo( |
| Map<RegionInfo, List<ServerName>> regionToFavoredNodes, |
| Connection connection) throws IOException { |
| List<Put> puts = new ArrayList<>(); |
| for (Map.Entry<RegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) { |
| Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue()); |
| if (put != null) { |
| puts.add(put); |
| } |
| } |
| MetaTableAccessor.putsToMetaTable(connection, puts); |
| LOG.info("Added " + puts.size() + " regions in META"); |
| } |
| |
| /** |
| * Update meta table with favored nodes info |
   * @param regionToFavoredNodes map of RegionInfo's to their favored nodes
   * @param conf configuration used to connect to meta
   * @throws IOException if a remote or network exception occurs while updating meta
| */ |
| public static void updateMetaWithFavoredNodesInfo( |
| Map<RegionInfo, List<ServerName>> regionToFavoredNodes, |
| Configuration conf) throws IOException { |
| List<Put> puts = new ArrayList<>(); |
| for (Map.Entry<RegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) { |
| Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue()); |
| if (put != null) { |
| puts.add(put); |
| } |
| } |
| // Write the region assignments to the meta table. |
    // TODO: The overload above takes a Connection rather than a Configuration only because the
    // Connection is a short-circuit connection. That is not going to work in all cases, e.g.
    // when master and meta are not colocated. Fix when this favored nodes feature is actually
    // used someday.
| try (Connection connection = ConnectionFactory.createConnection(conf)) { |
| try (Table metaTable = connection.getTable(TableName.META_TABLE_NAME)) { |
| metaTable.put(puts); |
| } |
| } |
| LOG.info("Added " + puts.size() + " regions in META"); |
| } |
| |
| /** |
   * Generates and returns a Put containing the RegionInfo for the catalog table and the
   * favored nodes.
   * @return Put object, or null when the favored node list is null
| */ |
| private static Put makePutFromRegionInfo(RegionInfo regionInfo, List<ServerName> favoredNodeList) |
| throws IOException { |
| Put put = null; |
| if (favoredNodeList != null) { |
| long time = EnvironmentEdgeManager.currentTime(); |
| put = MetaTableAccessor.makePutFromRegionInfo(regionInfo, time); |
| byte[] favoredNodes = getFavoredNodes(favoredNodeList); |
| put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY) |
| .setRow(put.getRow()) |
| .setFamily(HConstants.CATALOG_FAMILY) |
| .setQualifier(FAVOREDNODES_QUALIFIER) |
| .setTimestamp(time) |
| .setType(Type.Put) |
| .setValue(favoredNodes) |
| .build()); |
| LOG.debug("Create the region {} with favored nodes {}", regionInfo.getRegionNameAsString(), |
| favoredNodeList); |
| } |
| return put; |
| } |
| |
| /** |
| * @param favoredNodes The PB'ed bytes of favored nodes |
| * @return the array of {@link ServerName} for the byte array of favored nodes. |
   * @throws IOException if the favored nodes bytes cannot be parsed
| */ |
| public static ServerName[] getFavoredNodesList(byte[] favoredNodes) throws IOException { |
| FavoredNodes f = FavoredNodes.parseFrom(favoredNodes); |
| List<HBaseProtos.ServerName> protoNodes = f.getFavoredNodeList(); |
| ServerName[] servers = new ServerName[protoNodes.size()]; |
| int i = 0; |
| for (HBaseProtos.ServerName node : protoNodes) { |
| servers[i++] = ProtobufUtil.toServerName(node); |
| } |
| return servers; |
| } |
| |
| /** |
   * @param serverAddrList a list of server addresses
| * @return PB'ed bytes of {@link FavoredNodes} generated by the server list. |
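   * <p>
   * A round-trip sketch ({@code favoredNodeList} is hypothetical):
   * <pre>{@code
   * byte[] bytes = FavoredNodeAssignmentHelper.getFavoredNodes(favoredNodeList);
   * ServerName[] restored = FavoredNodeAssignmentHelper.getFavoredNodesList(bytes);
   * // restored servers carry ServerName.NON_STARTCODE as their start code
   * }</pre>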
| */ |
| public static byte[] getFavoredNodes(List<ServerName> serverAddrList) { |
| FavoredNodes.Builder f = FavoredNodes.newBuilder(); |
| for (ServerName s : serverAddrList) { |
| HBaseProtos.ServerName.Builder b = HBaseProtos.ServerName.newBuilder(); |
| b.setHostName(s.getHostname()); |
| b.setPort(s.getPort()); |
| b.setStartCode(ServerName.NON_STARTCODE); |
| f.addFavoredNode(b.build()); |
| } |
| return f.build().toByteArray(); |
| } |
| |
  // Place the regions round-robin across the racks, picking one server from each
  // rack at a time. Start with a random rack and a random server from every rack.
  // If a rack doesn't have enough servers, move on to the next rack, and so on,
  // when choosing a primary.
  // For example, with 4 racks (r1..r4) of 8 servers (s1..s8) each, one possible
  // placement could be r2:s5, r3:s5, r4:s5, r1:s5, r2:s6, r3:s6..
  // If one rack had fewer servers, say r3 with 3 servers, one possible
  // placement could be r2:s5, <skip-r3>, r4:s5, r1:s5, r2:s6, <skip-r3> ...
  // The regions should be distributed proportionally to the rack sizes.
| public void placePrimaryRSAsRoundRobin(Map<ServerName, List<RegionInfo>> assignmentMap, |
| Map<RegionInfo, ServerName> primaryRSMap, List<RegionInfo> regions) { |
| List<String> rackList = new ArrayList<>(rackToRegionServerMap.size()); |
| rackList.addAll(rackToRegionServerMap.keySet()); |
| int rackIndex = random.nextInt(rackList.size()); |
| int maxRackSize = 0; |
| for (Map.Entry<String,List<ServerName>> r : rackToRegionServerMap.entrySet()) { |
| if (r.getValue().size() > maxRackSize) { |
| maxRackSize = r.getValue().size(); |
| } |
| } |
| int numIterations = 0; |
| // Initialize the current processing host index. |
| int serverIndex = random.nextInt(maxRackSize); |
| for (RegionInfo regionInfo : regions) { |
| List<ServerName> currentServerList; |
| String rackName; |
| while (true) { |
| rackName = rackList.get(rackIndex); |
| numIterations++; |
| // Get the server list for the current rack |
| currentServerList = rackToRegionServerMap.get(rackName); |
| |
        if (serverIndex >= currentServerList.size()) { // not enough machines in this rack
          if (numIterations % rackList.size() == 0) {
            if (++serverIndex >= maxRackSize) {
              serverIndex = 0;
            }
          }
          if ((++rackIndex) >= rackList.size()) {
            rackIndex = 0; // reset the rack index to 0
          }
        } else {
          break;
        }
| } |
| |
      // Get the region server at the current server index in the chosen rack
| ServerName currentServer = currentServerList.get(serverIndex); |
| |
| // Place the current region with the current primary region server |
| primaryRSMap.put(regionInfo, currentServer); |
| if (assignmentMap != null) { |
| List<RegionInfo> regionsForServer = assignmentMap.get(currentServer); |
| if (regionsForServer == null) { |
| regionsForServer = new ArrayList<>(); |
| assignmentMap.put(currentServer, regionsForServer); |
| } |
| regionsForServer.add(regionInfo); |
| } |
| |
| // Set the next processing index |
| if (numIterations % rackList.size() == 0) { |
| ++serverIndex; |
| } |
| if ((++rackIndex) >= rackList.size()) { |
| rackIndex = 0; // reset the rack index to 0 |
| } |
| } |
| } |
| |
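  /**
   * For each region in {@code primaryRSMap}, pick a secondary and tertiary favored node using
   * the rack layout built by {@link #initialize()}. Regions for which a pair cannot be
   * generated are logged and left out of the returned map.
   */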
| public Map<RegionInfo, ServerName[]> placeSecondaryAndTertiaryRS( |
| Map<RegionInfo, ServerName> primaryRSMap) { |
| Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>(); |
| for (Map.Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) { |
| // Get the target region and its primary region server rack |
| RegionInfo regionInfo = entry.getKey(); |
| ServerName primaryRS = entry.getValue(); |
| try { |
| // Create the secondary and tertiary region server pair object. |
| ServerName[] favoredNodes = getSecondaryAndTertiary(regionInfo, primaryRS); |
| if (favoredNodes != null) { |
| secondaryAndTertiaryMap.put(regionInfo, favoredNodes); |
| LOG.debug("Place the secondary and tertiary region server for region " |
| + regionInfo.getRegionNameAsString()); |
| } |
| } catch (Exception e) { |
| LOG.warn("Cannot place the favored nodes for region " + |
| regionInfo.getRegionNameAsString() + " because " + e, e); |
| continue; |
| } |
| } |
| return secondaryAndTertiaryMap; |
| } |
| |
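  /**
   * Pick the secondary and tertiary favored nodes for a region given its primary: from within
   * the primary's rack when the cluster has a single rack, otherwise from other racks.
   */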
| public ServerName[] getSecondaryAndTertiary(RegionInfo regionInfo, ServerName primaryRS) |
| throws IOException { |
| |
    ServerName[] favoredNodes;
    // Get the rack for the primary region server
    String primaryRack = getRackOfServer(primaryRS);
| |
| if (getTotalNumberOfRacks() == 1) { |
| favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack); |
| } else { |
| favoredNodes = multiRackCase(regionInfo, primaryRS, primaryRack); |
| } |
| return favoredNodes; |
| } |
| |
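  // Build the inverse of primaryRSMap: for each server, the set of regions for which it is
  // the primary favored node.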
| private Map<ServerName, Set<RegionInfo>> mapRSToPrimaries( |
| Map<RegionInfo, ServerName> primaryRSMap) { |
| Map<ServerName, Set<RegionInfo>> primaryServerMap = new HashMap<>(); |
| for (Entry<RegionInfo, ServerName> e : primaryRSMap.entrySet()) { |
| Set<RegionInfo> currentSet = primaryServerMap.get(e.getValue()); |
| if (currentSet == null) { |
| currentSet = new HashSet<>(); |
| } |
| currentSet.add(e.getKey()); |
| primaryServerMap.put(e.getValue(), currentSet); |
| } |
| return primaryServerMap; |
| } |
| |
| /** |
   * For regions that share the primary, avoid placing the secondary and tertiary
   * on the same RS. Used for generating new assignments for the
   * primary/secondary/tertiary RegionServers.
   * @param primaryRSMap map of regions to their primary region servers
| * @return the map of regions to the servers the region-files should be hosted on |
| */ |
| public Map<RegionInfo, ServerName[]> placeSecondaryAndTertiaryWithRestrictions( |
| Map<RegionInfo, ServerName> primaryRSMap) { |
| Map<ServerName, Set<RegionInfo>> serverToPrimaries = |
| mapRSToPrimaries(primaryRSMap); |
| Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap = new HashMap<>(); |
| |
| for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) { |
| // Get the target region and its primary region server rack |
| RegionInfo regionInfo = entry.getKey(); |
| ServerName primaryRS = entry.getValue(); |
| try { |
| // Get the rack for the primary region server |
| String primaryRack = getRackOfServer(primaryRS); |
| ServerName[] favoredNodes = null; |
| if (getTotalNumberOfRacks() == 1) { |
| // Single rack case: have to pick the secondary and tertiary |
| // from the same rack |
| favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack); |
| } else { |
| favoredNodes = multiRackCaseWithRestrictions(serverToPrimaries, |
| secondaryAndTertiaryMap, primaryRack, primaryRS, regionInfo); |
| } |
| if (favoredNodes != null) { |
| secondaryAndTertiaryMap.put(regionInfo, favoredNodes); |
| LOG.debug("Place the secondary and tertiary region server for region " |
| + regionInfo.getRegionNameAsString()); |
| } |
| } catch (Exception e) { |
| LOG.warn("Cannot place the favored nodes for region " |
| + regionInfo.getRegionNameAsString() + " because " + e, e); |
| continue; |
| } |
| } |
| return secondaryAndTertiaryMap; |
| } |
| |
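  /**
   * Multi-rack variant that, for regions sharing the same primary, also avoids reusing servers
   * already chosen as secondary/tertiary for the other regions of that primary.
   */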
| private ServerName[] multiRackCaseWithRestrictions( |
| Map<ServerName, Set<RegionInfo>> serverToPrimaries, |
| Map<RegionInfo, ServerName[]> secondaryAndTertiaryMap, |
| String primaryRack, ServerName primaryRS, RegionInfo regionInfo) throws IOException { |
    // Randomly choose a rack other than the primary rack, and pick the secondary and
    // tertiary region servers from it.
| Set<String> rackSkipSet = new HashSet<>(); |
| rackSkipSet.add(primaryRack); |
| String secondaryRack = getOneRandomRack(rackSkipSet); |
| List<ServerName> serverList = getServersFromRack(secondaryRack); |
| Set<ServerName> serverSet = new HashSet<>(serverList); |
| ServerName[] favoredNodes; |
| if (serverList.size() >= 2) { |
| // Randomly pick up two servers from this secondary rack |
| // Skip the secondary for the tertiary placement |
| // skip the servers which share the primary already |
| Set<RegionInfo> primaries = serverToPrimaries.get(primaryRS); |
| Set<ServerName> skipServerSet = new HashSet<>(); |
| while (true) { |
| ServerName[] secondaryAndTertiary = null; |
        if (primaries.size() > 1) {
          // Check where the secondaries and tertiaries of the other co-hosted primaries are
| for (RegionInfo primary : primaries) { |
| secondaryAndTertiary = secondaryAndTertiaryMap.get(primary); |
| if (secondaryAndTertiary != null) { |
| if (getRackOfServer(secondaryAndTertiary[0]).equals(secondaryRack)) { |
| skipServerSet.add(secondaryAndTertiary[0]); |
| } |
| if (getRackOfServer(secondaryAndTertiary[1]).equals(secondaryRack)) { |
| skipServerSet.add(secondaryAndTertiary[1]); |
| } |
| } |
| } |
| } |
        if (skipServerSet.size() + 2 <= serverSet.size()) {
          break;
        }
| skipServerSet.clear(); |
        rackSkipSet.add(secondaryRack);
        // We have used all racks
        if (rackSkipSet.size() == getTotalNumberOfRacks()) {
          // The skip set was just cleared, so there is nothing to unwind; stop here and
          // place within this last secondary rack.
          break;
        }
| secondaryRack = getOneRandomRack(rackSkipSet); |
| serverList = getServersFromRack(secondaryRack); |
| serverSet = new HashSet<>(serverList); |
| } |
| |
| // Place the secondary RS |
| ServerName secondaryRS = getOneRandomServer(secondaryRack, skipServerSet); |
| skipServerSet.add(secondaryRS); |
| // Place the tertiary RS |
| ServerName tertiaryRS = getOneRandomServer(secondaryRack, skipServerSet); |
| |
| if (secondaryRS == null || tertiaryRS == null) { |
| LOG.error("Cannot place the secondary and tertiary" |
| + " region server for region " |
| + regionInfo.getRegionNameAsString()); |
| } |
| // Create the secondary and tertiary pair |
| favoredNodes = new ServerName[2]; |
| favoredNodes[0] = secondaryRS; |
| favoredNodes[1] = tertiaryRS; |
| } else { |
      // Pick the secondary RS from this secondary rack
| // and pick the tertiary from another random rack |
| favoredNodes = new ServerName[2]; |
| ServerName secondary = getOneRandomServer(secondaryRack); |
| favoredNodes[0] = secondary; |
| |
| // Pick the tertiary |
| if (getTotalNumberOfRacks() == 2) { |
| // Pick the tertiary from the same rack of the primary RS |
| Set<ServerName> serverSkipSet = new HashSet<>(); |
| serverSkipSet.add(primaryRS); |
| favoredNodes[1] = getOneRandomServer(primaryRack, serverSkipSet); |
| } else { |
| // Pick the tertiary from another rack |
| rackSkipSet.add(secondaryRack); |
| String tertiaryRandomRack = getOneRandomRack(rackSkipSet); |
| favoredNodes[1] = getOneRandomServer(tertiaryRandomRack); |
| } |
| } |
| return favoredNodes; |
| } |
| |
| private ServerName[] singleRackCase(RegionInfo regionInfo, |
| ServerName primaryRS, |
| String primaryRack) throws IOException { |
| // Single rack case: have to pick the secondary and tertiary |
| // from the same rack |
| List<ServerName> serverList = getServersFromRack(primaryRack); |
| if ((serverList == null) || (serverList.size() <= 2)) { |
      // Fewer than three servers in the rack: cannot place the favored
      // nodes on distinct servers
| return null; |
| } else { |
      // Randomly select two region servers from the server list and make sure
      // they do not overlap with the primary region server
| Set<ServerName> serverSkipSet = new HashSet<>(); |
| serverSkipSet.add(primaryRS); |
| |
| // Place the secondary RS |
| ServerName secondaryRS = getOneRandomServer(primaryRack, serverSkipSet); |
| // Skip the secondary for the tertiary placement |
| serverSkipSet.add(secondaryRS); |
| ServerName tertiaryRS = getOneRandomServer(primaryRack, serverSkipSet); |
| |
| if (secondaryRS == null || tertiaryRS == null) { |
| LOG.error("Cannot place the secondary, tertiary favored node for region " + |
| regionInfo.getRegionNameAsString()); |
| } |
| // Create the secondary and tertiary pair |
| ServerName[] favoredNodes = new ServerName[2]; |
| favoredNodes[0] = secondaryRS; |
| favoredNodes[1] = tertiaryRS; |
| return favoredNodes; |
| } |
| } |
| |
| /** |
| * Place secondary and tertiary nodes in a multi rack case. |
| * If there are only two racks, then we try the place the secondary |
| * and tertiary on different rack than primary. But if the other rack has |
| * only one region server, then we place primary and tertiary on one rack |
| * and secondary on another. The aim is two distribute the three favored nodes |
| * on >= 2 racks. |
| * TODO: see how we can use generateMissingFavoredNodeMultiRack API here |
| * @param regionInfo Region for which we are trying to generate FN |
| * @param primaryRS The primary favored node. |
| * @param primaryRack The rack of the primary favored node. |
| * @return Array containing secondary and tertiary favored nodes. |
| * @throws IOException Signals that an I/O exception has occurred. |
| */ |
| private ServerName[] multiRackCase(RegionInfo regionInfo, ServerName primaryRS, |
| String primaryRack) throws IOException { |
| |
    List<ServerName> favoredNodes = Lists.newArrayList(primaryRS);
| // Create the secondary and tertiary pair |
| ServerName secondaryRS = generateMissingFavoredNodeMultiRack(favoredNodes); |
| favoredNodes.add(secondaryRS); |
| String secondaryRack = getRackOfServer(secondaryRS); |
| |
| ServerName tertiaryRS; |
| if (primaryRack.equals(secondaryRack)) { |
| tertiaryRS = generateMissingFavoredNode(favoredNodes); |
| } else { |
| // Try to place tertiary in secondary RS rack else place on primary rack. |
| tertiaryRS = getOneRandomServer(secondaryRack, Sets.newHashSet(secondaryRS)); |
| if (tertiaryRS == null) { |
| tertiaryRS = getOneRandomServer(primaryRack, Sets.newHashSet(primaryRS)); |
| } |
| // We couldn't find anything in secondary rack, get any FN |
| if (tertiaryRS == null) { |
| tertiaryRS = generateMissingFavoredNode(Lists.newArrayList(primaryRS, secondaryRS)); |
| } |
| } |
| return new ServerName[]{ secondaryRS, tertiaryRS }; |
| } |
| |
| public boolean canPlaceFavoredNodes() { |
| return (this.servers.size() >= FAVORED_NODES_NUM); |
| } |
| |
| private int getTotalNumberOfRacks() { |
| return this.uniqueRackList.size(); |
| } |
| |
| private List<ServerName> getServersFromRack(String rack) { |
| return this.rackToRegionServerMap.get(rack); |
| } |
| |
| /** |
| * Gets a random server from the specified rack and skips anything specified. |
| |
| * @param rack rack from a server is needed |
| * @param skipServerSet the server shouldn't belong to this set |
| */ |
| protected ServerName getOneRandomServer(String rack, Set<ServerName> skipServerSet) { |
| |
| // Is the rack valid? Do we recognize it? |
| if (rack == null || getServersFromRack(rack) == null || |
| getServersFromRack(rack).isEmpty()) { |
| return null; |
| } |
| |
    // Use a set so we can eliminate duplicate addresses (start codes are ignored)
| Set<StartcodeAgnosticServerName> serversToChooseFrom = Sets.newHashSet(); |
| for (ServerName sn : getServersFromRack(rack)) { |
| serversToChooseFrom.add(StartcodeAgnosticServerName.valueOf(sn)); |
| } |
| |
| if (skipServerSet != null && skipServerSet.size() > 0) { |
| for (ServerName sn : skipServerSet) { |
| serversToChooseFrom.remove(StartcodeAgnosticServerName.valueOf(sn)); |
| } |
| // Do we have any servers left to choose from? |
| if (serversToChooseFrom.isEmpty()) { |
| return null; |
| } |
| } |
| |
| ServerName randomServer = null; |
| int randomIndex = random.nextInt(serversToChooseFrom.size()); |
| int j = 0; |
| for (StartcodeAgnosticServerName sn : serversToChooseFrom) { |
| if (j == randomIndex) { |
| randomServer = sn; |
| break; |
| } |
| j++; |
| } |
| |
| if (randomServer != null) { |
| return ServerName.valueOf(randomServer.getHostAndPort(), randomServer.getStartcode()); |
| } else { |
| return null; |
| } |
| } |
| |
| private ServerName getOneRandomServer(String rack) throws IOException { |
| return this.getOneRandomServer(rack, null); |
| } |
| |
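  /**
   * Gets a random rack that is not in the skip set.
   * @param skipRackSet racks that must not be chosen
   * @throws IOException if no rack outside the skip set can be chosen
   */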
| protected String getOneRandomRack(Set<String> skipRackSet) throws IOException { |
| if (skipRackSet == null || uniqueRackList.size() <= skipRackSet.size()) { |
| throw new IOException("Cannot randomly pick another random server"); |
| } |
| |
| String randomRack; |
| do { |
| int randomIndex = random.nextInt(this.uniqueRackList.size()); |
| randomRack = this.uniqueRackList.get(randomIndex); |
| } while (skipRackSet.contains(randomRack)); |
| |
| return randomRack; |
| } |
| |
| public static String getFavoredNodesAsString(List<ServerName> nodes) { |
| StringBuilder strBuf = new StringBuilder(); |
| int i = 0; |
| for (ServerName node : nodes) { |
| strBuf.append(node.getHostAndPort()); |
      if (++i != nodes.size()) {
        strBuf.append(";");
      }
| } |
| return strBuf.toString(); |
| } |
| |
| /* |
| * Generates a missing favored node based on the input favored nodes. This helps to generate |
| * new FN when there is already 2 FN and we need a third one. For eg, while generating new FN |
| * for split daughters after inheriting 2 FN from the parent. If the cluster has only one rack |
| * it generates from the same rack. If the cluster has multiple racks, then it ensures the new |
| * FN respects the rack constraints similar to HDFS. For eg: if there are 3 FN, they will be |
| * spread across 2 racks. |
| */ |
  public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes) throws IOException {
    return generateMissingFavoredNode(favoredNodes, null);
  }
| |
| public ServerName generateMissingFavoredNode(List<ServerName> favoredNodes, |
| List<ServerName> excludeNodes) throws IOException { |
| if (this.uniqueRackList.size() == 1) { |
| return generateMissingFavoredNodeSingleRack(favoredNodes, excludeNodes); |
| } else { |
| return generateMissingFavoredNodeMultiRack(favoredNodes, excludeNodes); |
| } |
| } |
| |
| /* |
| * Generate FN for a single rack scenario, don't generate from one of the excluded nodes. Helps |
| * when we would like to find a replacement node. |
| */ |
| private ServerName generateMissingFavoredNodeSingleRack(List<ServerName> favoredNodes, |
| List<ServerName> excludeNodes) throws IOException { |
| ServerName newServer = null; |
| Set<ServerName> excludeFNSet = Sets.newHashSet(favoredNodes); |
| if (excludeNodes != null && excludeNodes.size() > 0) { |
| excludeFNSet.addAll(excludeNodes); |
| } |
| if (favoredNodes.size() < FAVORED_NODES_NUM) { |
| newServer = this.getOneRandomServer(this.uniqueRackList.get(0), excludeFNSet); |
| } |
| return newServer; |
| } |
| |
| private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes) |
| throws IOException { |
| return generateMissingFavoredNodeMultiRack(favoredNodes, null); |
| } |
| |
| /* |
| * Generates a missing FN based on the input favoredNodes and also the nodes to be skipped. |
| * |
| * Get the current layout of favored nodes arrangement and nodes to be excluded and get a |
| * random node that goes with HDFS block placement. Eg: If the existing nodes are on one rack, |
| * generate one from another rack. We exclude as much as possible so the random selection |
| * has more chance to generate a node within a few iterations, ideally 1. |
| */ |
| private ServerName generateMissingFavoredNodeMultiRack(List<ServerName> favoredNodes, |
| List<ServerName> excludeNodes) throws IOException { |
| |
| Set<String> racks = Sets.newHashSet(); |
| Map<String, Set<ServerName>> rackToFNMapping = new HashMap<>(); |
| |
    // Understand the current rack distribution of the FNs
| for (ServerName sn : favoredNodes) { |
| String rack = getRackOfServer(sn); |
| racks.add(rack); |
| |
| Set<ServerName> serversInRack = rackToFNMapping.get(rack); |
| if (serversInRack == null) { |
| serversInRack = Sets.newHashSet(); |
| rackToFNMapping.put(rack, serversInRack); |
| } |
| serversInRack.add(sn); |
| } |
| |
| // What racks should be skipped while getting a FN? |
| Set<String> skipRackSet = Sets.newHashSet(); |
| |
| /* |
| * If both the FN are from the same rack, then we don't want to generate another FN on the |
| * same rack. If that rack fails, the region would be unavailable. |
| */ |
| if (racks.size() == 1 && favoredNodes.size() > 1) { |
| skipRackSet.add(racks.iterator().next()); |
| } |
| |
| /* |
| * If there are no free nodes on the existing racks, we should skip those racks too. We can |
| * reduce the number of iterations for FN selection. |
| */ |
| for (String rack : racks) { |
| if (getServersFromRack(rack) != null && |
| rackToFNMapping.get(rack).size() == getServersFromRack(rack).size()) { |
| skipRackSet.add(rack); |
| } |
| } |
| |
| Set<ServerName> favoredNodeSet = Sets.newHashSet(favoredNodes); |
| if (excludeNodes != null && excludeNodes.size() > 0) { |
| favoredNodeSet.addAll(excludeNodes); |
| } |
| |
| /* |
| * Lets get a random rack by excluding skipRackSet and generate a random FN from that rack. |
| */ |
| int i = 0; |
| Set<String> randomRacks = Sets.newHashSet(); |
| ServerName newServer = null; |
| do { |
| String randomRack = this.getOneRandomRack(skipRackSet); |
| newServer = this.getOneRandomServer(randomRack, favoredNodeSet); |
| randomRacks.add(randomRack); |
| i++; |
| } while ((i < MAX_ATTEMPTS_FN_GENERATION) && (newServer == null)); |
| |
| if (newServer == null) { |
| if (LOG.isTraceEnabled()) { |
| LOG.trace(String.format("Unable to generate additional favored nodes for %s after " |
| + "considering racks %s and skip rack %s with a unique rack list of %s and rack " |
| + "to RS map of %s and RS to rack map of %s", |
| StringUtils.join(favoredNodes, ","), randomRacks, skipRackSet, uniqueRackList, |
| rackToRegionServerMap, regionServerToRackMap)); |
| } |
| throw new IOException(" Unable to generate additional favored nodes for " |
| + StringUtils.join(favoredNodes, ",")); |
| } |
| return newServer; |
| } |
| |
| /* |
| * Generate favored nodes for a region. |
| * |
| * Choose a random server as primary and then choose secondary and tertiary FN so its spread |
| * across two racks. |
| */ |
| public List<ServerName> generateFavoredNodes(RegionInfo hri) throws IOException { |
| |
| List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM); |
| ServerName primary = servers.get(random.nextInt(servers.size())); |
| favoredNodesForRegion.add(ServerName.valueOf(primary.getHostAndPort(), ServerName.NON_STARTCODE)); |
| |
| Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>(1); |
| primaryRSMap.put(hri, primary); |
| Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap = |
| placeSecondaryAndTertiaryRS(primaryRSMap); |
| ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(hri); |
| if (secondaryAndTertiaryNodes != null && secondaryAndTertiaryNodes.length == 2) { |
| for (ServerName sn : secondaryAndTertiaryNodes) { |
| favoredNodesForRegion.add(ServerName.valueOf(sn.getHostAndPort(), ServerName.NON_STARTCODE)); |
| } |
| return favoredNodesForRegion; |
| } else { |
| throw new HBaseIOException("Unable to generate secondary and tertiary favored nodes."); |
| } |
| } |
| |
| public Map<RegionInfo, List<ServerName>> generateFavoredNodesRoundRobin( |
| Map<ServerName, List<RegionInfo>> assignmentMap, List<RegionInfo> regions) |
| throws IOException { |
| |
| if (regions.size() > 0) { |
| if (canPlaceFavoredNodes()) { |
| Map<RegionInfo, ServerName> primaryRSMap = new HashMap<>(); |
        // Try to have an equal distribution of the primary favored nodes
| placePrimaryRSAsRoundRobin(assignmentMap, primaryRSMap, regions); |
| return generateFavoredNodes(primaryRSMap); |
| |
| } else { |
| throw new HBaseIOException("Not enough nodes to generate favored nodes"); |
| } |
| } |
| return null; |
| } |
| |
| /* |
| * Generate favored nodes for a set of regions when we know where they are currently hosted. |
| */ |
| private Map<RegionInfo, List<ServerName>> generateFavoredNodes( |
| Map<RegionInfo, ServerName> primaryRSMap) { |
| |
| Map<RegionInfo, List<ServerName>> generatedFavNodes = new HashMap<>(); |
| Map<RegionInfo, ServerName[]> secondaryAndTertiaryRSMap = |
| placeSecondaryAndTertiaryRS(primaryRSMap); |
| |
| for (Entry<RegionInfo, ServerName> entry : primaryRSMap.entrySet()) { |
| List<ServerName> favoredNodesForRegion = new ArrayList<>(FAVORED_NODES_NUM); |
| RegionInfo region = entry.getKey(); |
| ServerName primarySN = entry.getValue(); |
| favoredNodesForRegion.add(ServerName.valueOf(primarySN.getHostname(), primarySN.getPort(), |
| NON_STARTCODE)); |
| ServerName[] secondaryAndTertiaryNodes = secondaryAndTertiaryRSMap.get(region); |
| if (secondaryAndTertiaryNodes != null) { |
| favoredNodesForRegion.add(ServerName.valueOf( |
| secondaryAndTertiaryNodes[0].getHostname(), secondaryAndTertiaryNodes[0].getPort(), |
| NON_STARTCODE)); |
| favoredNodesForRegion.add(ServerName.valueOf( |
| secondaryAndTertiaryNodes[1].getHostname(), secondaryAndTertiaryNodes[1].getPort(), |
| NON_STARTCODE)); |
| } |
| generatedFavNodes.put(region, favoredNodesForRegion); |
| } |
| return generatedFavNodes; |
| } |
| |
| /* |
| * Get the rack of server from local mapping when present, saves lookup by the RackManager. |
| */ |
| private String getRackOfServer(ServerName sn) { |
| if (this.regionServerToRackMap.containsKey(sn.getHostname())) { |
| return this.regionServerToRackMap.get(sn.getHostname()); |
| } else { |
| String rack = this.rackManager.getRack(sn); |
| this.regionServerToRackMap.put(sn.getHostname(), rack); |
| return rack; |
| } |
| } |
| } |