blob: 6d49459b739f1652307857ffa5e4098366f5c60b [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.hadoop.hdds.scm.container.placement.algorithms;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.hdds.scm.net.NetConstants;
import org.apache.hadoop.hdds.scm.net.NetworkTopology;
import org.apache.hadoop.hdds.scm.net.Node;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Container placement policy that choose datanodes with network topology
* awareness, together with the space to satisfy the size constraints.
* <p>
* This placement policy complies with the algorithm used in HDFS. With default
* 3 replica, two replica will be on the same rack, the third one will on a
* different rack.
* <p>
* This implementation applies to network topology like "/rack/node". Don't
* recommend to use this if the network topology has more layers.
* <p>
*/
public final class SCMContainerPlacementRackAware extends SCMCommonPolicy {
@VisibleForTesting
static final Logger LOG =
LoggerFactory.getLogger(SCMContainerPlacementRackAware.class);
private final NetworkTopology networkTopology;
private boolean fallback;
private static final int RACK_LEVEL = 1;
private static final int MAX_RETRY= 3;
private final SCMContainerPlacementMetrics metrics;
/**
* Constructs a Container Placement with rack awareness.
*
* @param nodeManager Node Manager
* @param conf Configuration
* @param fallback Whether reducing constrains to choose a data node when
* there is no node which satisfy all constrains.
* Basically, false for open container placement, and true
* for closed container placement.
*/
public SCMContainerPlacementRackAware(final NodeManager nodeManager,
final Configuration conf, final NetworkTopology networkTopology,
final boolean fallback, final SCMContainerPlacementMetrics metrics) {
super(nodeManager, conf);
this.networkTopology = networkTopology;
this.fallback = fallback;
this.metrics = metrics;
}
/**
* Called by SCM to choose datanodes.
* There are two scenarios, one is choosing all nodes for a new pipeline.
* Another is choosing node to meet replication requirement.
*
*
* @param excludedNodes - list of the datanodes to exclude.
* @param favoredNodes - list of nodes preferred. This is a hint to the
* allocator, whether the favored nodes will be used
* depends on whether the nodes meets the allocator's
* requirement.
* @param nodesRequired - number of datanodes required.
* @param sizeRequired - size required for the container or block.
* @return List of datanodes.
* @throws SCMException SCMException
*/
@Override
public List<DatanodeDetails> chooseDatanodes(
List<DatanodeDetails> excludedNodes, List<DatanodeDetails> favoredNodes,
int nodesRequired, final long sizeRequired) throws SCMException {
Preconditions.checkArgument(nodesRequired > 0);
metrics.incrDatanodeRequestCount(nodesRequired);
int datanodeCount = networkTopology.getNumOfLeafNode(NetConstants.ROOT);
int excludedNodesCount = excludedNodes == null ? 0 : excludedNodes.size();
if (datanodeCount < nodesRequired + excludedNodesCount) {
throw new SCMException("No enough datanodes to choose. " +
"TotalNode = " + datanodeCount +
"RequiredNode = " + nodesRequired +
"ExcludedNode = " + excludedNodesCount, null);
}
List<DatanodeDetails> mutableFavoredNodes = favoredNodes;
// sanity check of favoredNodes
if (mutableFavoredNodes != null && excludedNodes != null) {
mutableFavoredNodes = new ArrayList<>();
mutableFavoredNodes.addAll(favoredNodes);
mutableFavoredNodes.removeAll(excludedNodes);
}
int favoredNodeNum = mutableFavoredNodes == null? 0 :
mutableFavoredNodes.size();
List<Node> chosenNodes = new ArrayList<>();
int favorIndex = 0;
if (excludedNodes == null || excludedNodes.isEmpty()) {
// choose all nodes for a new pipeline case
// choose first datanode from scope ROOT or from favoredNodes if not null
Node favoredNode = favoredNodeNum > favorIndex ?
mutableFavoredNodes.get(favorIndex) : null;
Node firstNode;
if (favoredNode != null) {
firstNode = favoredNode;
favorIndex++;
} else {
firstNode = chooseNode(null, null, sizeRequired);
}
chosenNodes.add(firstNode);
nodesRequired--;
if (nodesRequired == 0) {
return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
}
// choose second datanode on the same rack as first one
favoredNode = favoredNodeNum > favorIndex ?
mutableFavoredNodes.get(favorIndex) : null;
Node secondNode;
if (favoredNode != null &&
networkTopology.isSameParent(firstNode, favoredNode)) {
secondNode = favoredNode;
favorIndex++;
} else {
secondNode = chooseNode(chosenNodes, firstNode, sizeRequired);
}
chosenNodes.add(secondNode);
nodesRequired--;
if (nodesRequired == 0) {
return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
}
// choose remaining datanodes on different rack as first and second
return chooseNodes(null, chosenNodes, mutableFavoredNodes, favorIndex,
nodesRequired, sizeRequired);
} else {
List<Node> mutableExcludedNodes = new ArrayList<>();
mutableExcludedNodes.addAll(excludedNodes);
// choose node to meet replication requirement
// case 1: one excluded node, choose one on the same rack as the excluded
// node, choose others on different racks.
Node favoredNode;
if (excludedNodes.size() == 1) {
favoredNode = favoredNodeNum > favorIndex ?
mutableFavoredNodes.get(favorIndex) : null;
Node firstNode;
if (favoredNode != null &&
networkTopology.isSameParent(excludedNodes.get(0), favoredNode)) {
firstNode = favoredNode;
favorIndex++;
} else {
firstNode = chooseNode(mutableExcludedNodes, excludedNodes.get(0),
sizeRequired);
}
chosenNodes.add(firstNode);
nodesRequired--;
if (nodesRequired == 0) {
return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
}
// choose remaining nodes on different racks
return chooseNodes(null, chosenNodes, mutableFavoredNodes, favorIndex,
nodesRequired, sizeRequired);
}
// case 2: two or more excluded nodes, if these two nodes are
// in the same rack, then choose nodes on different racks, otherwise,
// choose one on the same rack as one of excluded nodes, remaining chosen
// are on different racks.
for(int i = 0; i < excludedNodesCount; i++) {
for (int j = i + 1; j < excludedNodesCount; j++) {
if (networkTopology.isSameParent(
excludedNodes.get(i), excludedNodes.get(j))) {
// choose remaining nodes on different racks
return chooseNodes(mutableExcludedNodes, chosenNodes,
mutableFavoredNodes, favorIndex, nodesRequired, sizeRequired);
}
}
}
// choose one data on the same rack with one excluded node
favoredNode = favoredNodeNum > favorIndex ?
mutableFavoredNodes.get(favorIndex) : null;
Node secondNode;
if (favoredNode != null && networkTopology.isSameParent(
mutableExcludedNodes.get(0), favoredNode)) {
secondNode = favoredNode;
favorIndex++;
} else {
secondNode =
chooseNode(chosenNodes, mutableExcludedNodes.get(0), sizeRequired);
}
chosenNodes.add(secondNode);
mutableExcludedNodes.add(secondNode);
nodesRequired--;
if (nodesRequired == 0) {
return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
}
// choose remaining nodes on different racks
return chooseNodes(mutableExcludedNodes, chosenNodes, mutableFavoredNodes,
favorIndex, nodesRequired, sizeRequired);
}
}
@Override
public DatanodeDetails chooseNode(List<DatanodeDetails> healthyNodes) {
return null;
}
/**
* Choose a datanode which meets the requirements. If there is no node which
* meets all the requirements, there is fallback chosen process depending on
* whether fallback is allowed when this class is instantiated.
*
*
* @param excludedNodes - list of the datanodes to excluded. Can be null.
* @param affinityNode - the chosen nodes should be on the same rack as
* affinityNode. Can be null.
* @param sizeRequired - size required for the container or block.
* @return List of chosen datanodes.
* @throws SCMException SCMException
*/
private Node chooseNode(List<Node> excludedNodes, Node affinityNode,
long sizeRequired) throws SCMException {
int ancestorGen = RACK_LEVEL;
int maxRetry = MAX_RETRY;
List<String> excludedNodesForCapacity = null;
boolean isFallbacked = false;
while(true) {
metrics.incrDatanodeChooseAttemptCount();
Node node = networkTopology.chooseRandom(NetConstants.ROOT,
excludedNodesForCapacity, excludedNodes, affinityNode, ancestorGen);
if (node == null) {
// cannot find the node which meets all constrains
LOG.warn("Failed to find the datanode for container. excludedNodes:" +
(excludedNodes == null ? "" : excludedNodes.toString()) +
", affinityNode:" +
(affinityNode == null ? "" : affinityNode.getNetworkFullPath()));
if (fallback) {
isFallbacked = true;
// fallback, don't consider the affinity node
if (affinityNode != null) {
affinityNode = null;
continue;
}
// fallback, don't consider cross rack
if (ancestorGen == RACK_LEVEL) {
ancestorGen--;
continue;
}
}
// there is no constrains to reduce or fallback is true
throw new SCMException("No satisfied datanode to meet the" +
" excludedNodes and affinityNode constrains.", null);
}
if (hasEnoughSpace((DatanodeDetails)node, sizeRequired)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Datanode {} is chosen for container. Required size is {}",
node.toString(), sizeRequired);
}
metrics.incrDatanodeChooseSuccessCount();
if (isFallbacked) {
metrics.incrDatanodeChooseFallbackCount();
}
return node;
} else {
maxRetry--;
if (maxRetry == 0) {
// avoid the infinite loop
String errMsg = "No satisfied datanode to meet the space constrains. "
+ " sizeRequired: " + sizeRequired;
LOG.info(errMsg);
throw new SCMException(errMsg, null);
}
if (excludedNodesForCapacity == null) {
excludedNodesForCapacity = new ArrayList<>();
}
excludedNodesForCapacity.add(node.getNetworkFullPath());
}
}
}
/**
* Choose a batch of datanodes on different rack than excludedNodes or
* chosenNodes.
*
*
* @param excludedNodes - list of the datanodes to excluded. Can be null.
* @param chosenNodes - list of nodes already chosen. These nodes should also
* be excluded. Cannot be null.
* @param favoredNodes - list of favoredNodes. It's a hint. Whether the nodes
* are chosen depends on whether they meet the constrains.
* Can be null.
* @param favorIndex - the node index of favoredNodes which is not chosen yet.
* @param sizeRequired - size required for the container or block.
* @param nodesRequired - number of datanodes required.
* @param sizeRequired - size required for the container or block.
* @return List of chosen datanodes.
* @throws SCMException SCMException
*/
private List<DatanodeDetails> chooseNodes(List<Node> excludedNodes,
List<Node> chosenNodes, List<DatanodeDetails> favoredNodes,
int favorIndex, int nodesRequired, long sizeRequired)
throws SCMException {
Preconditions.checkArgument(chosenNodes != null);
List<Node> excludedNodeList = excludedNodes != null ?
excludedNodes : chosenNodes;
int favoredNodeNum = favoredNodes == null? 0 : favoredNodes.size();
while(true) {
Node favoredNode = favoredNodeNum > favorIndex ?
favoredNodes.get(favorIndex) : null;
Node chosenNode;
if (favoredNode != null && networkTopology.isSameParent(
excludedNodeList.get(excludedNodeList.size() - 1), favoredNode)) {
chosenNode = favoredNode;
favorIndex++;
} else {
chosenNode = chooseNode(excludedNodeList, null, sizeRequired);
}
excludedNodeList.add(chosenNode);
if (excludedNodeList != chosenNodes) {
chosenNodes.add(chosenNode);
}
nodesRequired--;
if (nodesRequired == 0) {
return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
}
}
}
}