/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.hadoop.hdds.scm.container.placement.algorithms;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeMetric;
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.hdds.scm.net.NetworkTopology;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Container placement policy that randomly chooses datanodes with enough
* remaining space to satisfy the size constraints.
* <p>
* The algorithm is as follows: pick two random nodes from a given pool of
* nodes and then pick the node with the lower utilization. This gives nodes
* with lower utilization a higher probability of being picked.
* <p>
* For those wondering why we choose two nodes randomly and keep the one
* with lower utilization: links to the original papers on this
* "power of two choices" technique can be found in HDFS-11564.
* <p>
* A brief summary -- rank the s nodes from lowest utilized (rank 1) to
* highest utilized (rank s). There are (s * (s - 1)) / 2 distinct pairs of
* nodes, and the rank k node is the less utilized member of exactly s - k
* of those pairs. So the probability that the rank k node wins a random
* pair is (2 * (s - k)) / (s * (s - 1)).
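* <p>
* For example, with s = 10 nodes the least utilized node (k = 1) wins
* 2 * (10 - 1) / (10 * 9) = 20% of the distinct pairs, while the most
* utilized node (k = 10) never wins a pair and is picked only when it is
* drawn twice.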
* <p>
* In English: because we pick two nodes and keep the one with lower
* utilization, less utilized nodes are much more likely to be picked than
* highly utilized ones.
* <p>
* This avoids the problem a purely capacity-based allocation scheme has
* when users add new nodes to the cluster: all traffic would be sent to
* those empty nodes. Here a new node receives a container only when it is
* one of the two randomly drawn candidates.
* <p>
* This leads to an I/O pattern where less utilized nodes are favoured over
* more utilized nodes, while part of the I/O still goes to the older, more
* utilized nodes.
* <p>
* With this algorithm in place, our hope is that the balancer tool needs to
* do little or no work and the cluster achieves a balanced distribution
* over time.
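* <p>
* A minimal sketch of the two-random-choice selection performed below, where
* {@code nodes}, {@code random} and the {@code utilization} lookup are
* hypothetical stand-ins for the real fields and the {@link SCMNodeMetric}
* comparison:
* <pre>{@code
* int a = random.nextInt(nodes.size());
* int b = random.nextInt(nodes.size());
* // Keep the less utilized of the two draws; if both indexes are equal,
* // either branch yields the same node.
* DatanodeDetails winner = utilization(nodes.get(a)) <= utilization(nodes.get(b))
*     ? nodes.get(a) : nodes.get(b);
* }</pre>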
*/
public final class SCMContainerPlacementCapacity extends SCMCommonPolicy {
@VisibleForTesting
static final Logger LOG =
LoggerFactory.getLogger(SCMContainerPlacementCapacity.class);
/**
* Constructs a container placement policy that considers only capacity.
* That is, this policy tries to place containers based on node weight.
*
* @param nodeManager Node Manager
* @param conf Configuration
* @param networkTopology network topology (unused by this policy)
* @param fallback fallback flag (unused by this policy)
* @param metrics placement metrics (unused by this policy)
*/
public SCMContainerPlacementCapacity(final NodeManager nodeManager,
final Configuration conf, final NetworkTopology networkTopology,
final boolean fallback, final SCMContainerPlacementMetrics metrics) {
super(nodeManager, conf);
}
/**
* Called by SCM to choose datanodes.
*
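* <p>
* A hedged usage sketch; how the caller obtains the {@code policy} instance
* and the excluded list is assumed:
* <pre>{@code
* // Ask for three healthy datanodes with at least 1 GB of free space,
* // excluding nodes that already hold a replica. favoredNodes is passed
* // as null here, assuming the policy tolerates it.
* List<DatanodeDetails> targets = policy.chooseDatanodes(
*     excludedNodes, null, 3, 1024L * 1024 * 1024);
* }</pre>
*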
* @param excludedNodes - list of the datanodes to exclude.
* @param favoredNodes - list of nodes preferred.
* @param nodesRequired - number of datanodes required.
* @param sizeRequired - size required for the container or block.
* @return List of datanodes.
* @throws SCMException SCMException
*/
@Override
public List<DatanodeDetails> chooseDatanodes(
List<DatanodeDetails> excludedNodes, List<DatanodeDetails> favoredNodes,
final int nodesRequired, final long sizeRequired) throws SCMException {
List<DatanodeDetails> healthyNodes = super.chooseDatanodes(excludedNodes,
favoredNodes, nodesRequired, sizeRequired);
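// If the base policy returned exactly the requested number of healthy
// candidates there is nothing to choose between; otherwise narrow the
// list down via the two-random-choice selection in getResultSet.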
if (healthyNodes.size() == nodesRequired) {
return healthyNodes;
}
return getResultSet(nodesRequired, healthyNodes);
}
/**
* Chooses a node via two random draws: pick two nodes at random, keep the
* one with lower utilization, and remove it from the list that we are
* operating on before returning it.
*
* @param healthyNodes - List of healthy nodes that meet the size
* requirement.
* @return DatanodeDetails that is chosen.
*/
@Override
public DatanodeDetails chooseNode(List<DatanodeDetails> healthyNodes) {
int firstNodeNdx = getRand().nextInt(healthyNodes.size());
int secondNodeNdx = getRand().nextInt(healthyNodes.size());
DatanodeDetails datanodeDetails;
// Both random indexes may be the same; if so, we just take that node.
if (firstNodeNdx == secondNodeNdx) {
datanodeDetails = healthyNodes.get(firstNodeNdx);
} else {
DatanodeDetails firstNodeDetails = healthyNodes.get(firstNodeNdx);
DatanodeDetails secondNodeDetails = healthyNodes.get(secondNodeNdx);
SCMNodeMetric firstNodeMetric =
getNodeManager().getNodeStat(firstNodeDetails);
SCMNodeMetric secondNodeMetric =
getNodeManager().getNodeStat(secondNodeDetails);
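// Keep the node whose metric compares greater, which is assumed here to
// mean more remaining capacity, i.e. the less utilized of the two nodes.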
datanodeDetails = firstNodeMetric.isGreater(secondNodeMetric.get())
? firstNodeDetails : secondNodeDetails;
}
healthyNodes.remove(datanodeDetails);
return datanodeDetails;
}
}