src/hdfs/org/apache/hadoop/hdfs/server/namenode/ReplicationTargetChooser.java - hadoop - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.hadoop.hdfs.server.namenode;

 import org.apache.commons.logging.*;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.FSConstants;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.net.NetworkTopology;
 import org.apache.hadoop.net.Node;
 import org.apache.hadoop.net.NodeBase;
 import java.util.*;

 /** The class is responsible for choosing the desired number of targets
  * for placing block replicas.
  * The replica placement strategy is that if the writer is on a datanode,
  * the 1st replica is placed on the local machine,
  * otherwise a random datanode. The 2nd replica is placed on a datanode
  * that is on a different rack. The 3rd replica is placed on a datanode
  * which is on the same rack as the first replica.
  */
 class ReplicationTargetChooser {
   /** Whether a candidate's xceiver count is compared with the cluster average. */
   private final boolean considerLoad;
   /** Topology from which targets are drawn; also used as the lock in getPipeline. */
   private final NetworkTopology clusterMap;
   /** Namesystem queried for the cluster-wide total load. */
   private final FSNamesystem fs;

   /**
    * Creates a chooser bound to one namesystem and one network topology.
    * All three values are fixed for the lifetime of the instance.
    *
    * @param considerLoad true if busy datanodes should be rejected as targets.
    * @param fs namesystem that supplies the total load figure.
    * @param clusterMap topology describing the datanodes and their racks.
    */
   ReplicationTargetChooser(boolean considerLoad,  FSNamesystem fs,
                            NetworkTopology clusterMap) {
     this.considerLoad = considerLoad;
     this.fs = fs;
     this.clusterMap = clusterMap;
   }

   /**
    * Thrown by the selection helpers of this class when fewer suitable
    * datanodes are available than were asked for.
    */
   private static class NotEnoughReplicasException extends Exception {
     /** Exception is Serializable, so the stream version is pinned explicitly. */
     private static final long serialVersionUID = 1L;

     /**
      * @param msg description of the shortfall.
      */
     NotEnoughReplicasException(String msg) {
       super(msg);
     }
   }

   /**
    * Chooses up to <i>numOfReplicas</i> datanodes on which <i>writer</i>
    * can place a block of size <i>blocksize</i>, starting with no
    * previously chosen nodes. The work is done by the five-argument
    * overload, which is handed an empty list of chosen nodes.
    *
    * @param numOfReplicas number of replicas wanted.
    * @param writer the writer's machine, null if not in the cluster.
    * @param excludedNodes datanodes that should not be considered targets;
    *        null is treated as an empty list.
    * @param blocksize size of the data to be written.
    * @return array of DatanodeDescriptor instances chosen as targets
    * and sorted as a pipeline.
    */
   DatanodeDescriptor[] chooseTarget(int numOfReplicas,
                                     DatanodeDescriptor writer,
                                     List<Node> excludedNodes,
                                     long blocksize) {
     final List<Node> excluded =
       (excludedNodes != null) ? excludedNodes : new ArrayList<Node>();
     final List<DatanodeDescriptor> nothingChosenYet =
       new ArrayList<DatanodeDescriptor>();

     return chooseTarget(numOfReplicas, writer, nothingChosenYet,
                         excluded, blocksize);
   }

   /**
    * Chooses up to <i>numOfReplicas</i> additional datanodes for
    * <i>writer</i> to re-replicate a block of size <i>blocksize</i>.
    * If that many suitable nodes cannot be found, as many as possible
    * are returned. A non-positive request yields an empty array.
    * <p>
    * Side effect: the nodes in <i>choosenNodes</i> are appended to a
    * non-null <i>excludedNodes</i>, and the selection helpers add the
    * nodes they draw to the same list.
    *
    * @param numOfReplicas additional number of replicas wanted.
    * @param writer the writer's machine, null if not in the cluster.
    * @param choosenNodes datanodes that have already been chosen as targets.
    * @param excludedNodes datanodes that should not be considered targets;
    *        null is treated as an empty list.
    * @param blocksize size of the data to be written.
    * @return array of newly chosen DatanodeDescriptor instances
    * (the already chosen ones are not included), sorted as a pipeline.
    */
   DatanodeDescriptor[] chooseTarget(int numOfReplicas,
                                     DatanodeDescriptor writer,
                                     List<DatanodeDescriptor> choosenNodes,
                                     List<Node> excludedNodes,
                                     long blocksize) {
     final int clusterSize = clusterMap.getNumOfLeaves();
     if (numOfReplicas <= 0 || clusterSize == 0) {
       return new DatanodeDescriptor[0];
     }

     if (excludedNodes == null) {
       excludedNodes = new ArrayList<Node>();
     }

     int totalNumOfReplicas = choosenNodes.size()+numOfReplicas;
     if (totalNumOfReplicas > clusterSize) {
       // Cap the request at what the cluster can hold; when more nodes are
       // already chosen than the cluster has, the request becomes zero
       // instead of going negative.
       numOfReplicas =
         Math.max(0, numOfReplicas - (totalNumOfReplicas-clusterSize));
       totalNumOfReplicas = clusterSize;
     }

     int maxNodesPerRack =
       (totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2;

     // results starts with the existing replicas so rack limits see them
     List<DatanodeDescriptor> results =
       new ArrayList<DatanodeDescriptor>(choosenNodes);
     excludedNodes.addAll(choosenNodes);

     if (!clusterMap.contains(writer)) {
       writer=null;
     }

     DatanodeDescriptor localNode = chooseTarget(numOfReplicas, writer,
                                                 excludedNodes, blocksize, maxNodesPerRack, results);

     // report only the newly selected nodes
     results.removeAll(choosenNodes);

     // sorting nodes to form a pipeline
     return getPipeline((writer==null)?localNode:writer,
                        results.toArray(new DatanodeDescriptor[results.size()]));
   }

   /**
    * Chooses <i>numOfReplicas</i> further targets, appending them to
    * <i>results</i>. The starting step depends on how many replicas
    * <i>results</i> already holds, and each step falls through to the next:
    * <ol>
    * <li>none: the local node (see chooseLocalNode);</li>
    * <li>one: a node on a rack other than that of the first result;</li>
    * <li>two: a remote rack if the first two share a rack, otherwise the
    *     rack of the second result (new block) or of the writer;</li>
    * <li>more: random nodes anywhere in the cluster.</li>
    * </ol>
    * A shortfall is logged as a warning and is not propagated.
    *
    * @param numOfReplicas number of targets still wanted; a non-positive
    *        value selects nothing.
    * @param writer the writer's machine, or null; when null and
    *        <i>results</i> is non-empty the first result stands in for it.
    * @param excludedNodes nodes that must not be selected.
    * @param blocksize size of the data to be written.
    * @param maxNodesPerRack upper bound of chosen nodes on a single rack.
    * @param results already chosen nodes; new choices are appended.
    * @return the node treated as the writer: the local node picked for a
    *         new block, otherwise the (possibly substituted) writer.
    */
   private DatanodeDescriptor chooseTarget(int numOfReplicas,
                                           DatanodeDescriptor writer,
                                           List<Node> excludedNodes,
                                           long blocksize,
                                           int maxNodesPerRack,
                                           List<DatanodeDescriptor> results) {

     // "<= 0": a negative count would never hit the "--numOfReplicas == 0"
     // exits below and would place nodes nobody asked for.
     if (numOfReplicas <= 0 || clusterMap.getNumOfLeaves()==0) {
       return writer;
     }

     int numOfResults = results.size();
     boolean newBlock = (numOfResults==0);
     if (writer == null && !newBlock) {
       writer = results.get(0);
     }

     try {
       switch(numOfResults) {
       case 0:
         writer = chooseLocalNode(writer, excludedNodes,
                                  blocksize, maxNodesPerRack, results);
         if (--numOfReplicas == 0) {
           break;
         }
         // fall through
       case 1:
         chooseRemoteRack(1, results.get(0), excludedNodes,
                          blocksize, maxNodesPerRack, results);
         if (--numOfReplicas == 0) {
           break;
         }
         // fall through
       case 2:
         if (clusterMap.isOnSameRack(results.get(0), results.get(1))) {
           chooseRemoteRack(1, results.get(0), excludedNodes,
                            blocksize, maxNodesPerRack, results);
         } else if (newBlock){
           chooseLocalRack(results.get(1), excludedNodes, blocksize,
                           maxNodesPerRack, results);
         } else {
           chooseLocalRack(writer, excludedNodes, blocksize,
                           maxNodesPerRack, results);
         }
         if (--numOfReplicas == 0) {
           break;
         }
         // fall through
       default:
         chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes,
                      blocksize, maxNodesPerRack, results);
       }
     } catch (NotEnoughReplicasException e) {
       FSNamesystem.LOG.warn("Not able to place enough replicas, still in need of "
                + numOfReplicas);
     }
     return writer;
   }

   /**
    * Picks <i>localMachine</i> itself as a target when that is possible.
    * Without a local machine a random node of the cluster is picked.
    * If the local machine is already excluded or fails the target check
    * (made here without the load criterion), a node on its rack is tried
    * through chooseLocalRack. A local machine that was not yet excluded is
    * added to <i>excludedNodes</i> whether or not it is accepted.
    *
    * @return the chosen node, which has been appended to <i>results</i>
    * @throws NotEnoughReplicasException if the fallbacks find no node
    */
   private DatanodeDescriptor chooseLocalNode(
                                              DatanodeDescriptor localMachine,
                                              List<Node> excludedNodes,
                                              long blocksize,
                                              int maxNodesPerRack,
                                              List<DatanodeDescriptor> results)
     throws NotEnoughReplicasException {
     if (localMachine == null) {
       // no writer node to prefer: any node will do
       return chooseRandom(NodeBase.ROOT, excludedNodes,
                           blocksize, maxNodesPerRack, results);
     }

     final boolean alreadyExcluded = excludedNodes.contains(localMachine);
     if (!alreadyExcluded) {
       excludedNodes.add(localMachine);
       final boolean usable = isGoodTarget(localMachine, blocksize,
                                           maxNodesPerRack, false, results);
       if (usable) {
         results.add(localMachine);
         return localMachine;
       }
     }

     // the machine itself is unavailable: stay on its rack if we can
     return chooseLocalRack(localMachine, excludedNodes,
                            blocksize, maxNodesPerRack, results);
   }

   /**
    * Picks one node from the rack of <i>localMachine</i>.
    * If that rack yields nothing, the rack of the first entry of
    * <i>results</i> that is not <i>localMachine</i> is tried; if that
    * fails too, or no such entry exists, a random node of the whole
    * cluster is picked. Without a local machine the random pick is made
    * straight away.
    *
    * @return the chosen node, which has been appended to <i>results</i>
    * @throws NotEnoughReplicasException if even the cluster-wide pick fails
    */
   private DatanodeDescriptor chooseLocalRack(
                                              DatanodeDescriptor localMachine,
                                              List<Node> excludedNodes,
                                              long blocksize,
                                              int maxNodesPerRack,
                                              List<DatanodeDescriptor> results)
     throws NotEnoughReplicasException {
     if (localMachine == null) {
       return chooseRandom(NodeBase.ROOT, excludedNodes,
                           blocksize, maxNodesPerRack, results);
     }

     try {
       // first preference: the local rack
       return chooseRandom(localMachine.getNetworkLocation(),
                           excludedNodes, blocksize, maxNodesPerRack, results);
     } catch (NotEnoughReplicasException localRackExhausted) {
       // second preference: the rack of another already chosen replica
       DatanodeDescriptor otherReplica = null;
       for (DatanodeDescriptor chosen : results) {
         if (chosen != localMachine) {
           otherReplica = chosen;
           break;
         }
       }

       if (otherReplica != null) {
         try {
           return chooseRandom(otherReplica.getNetworkLocation(),
                               excludedNodes, blocksize, maxNodesPerRack,
                               results);
         } catch (NotEnoughReplicasException otherRackExhausted) {
           // continue with the cluster-wide pick below
         }
       }

       // last resort: anywhere in the network
       return chooseRandom(NodeBase.ROOT, excludedNodes,
                           blocksize, maxNodesPerRack, results);
     }
   }

   /**
    * Picks <i>numOfReplicas</i> nodes from racks other than the one
    * <i>localMachine</i> is on (scope "~" followed by its rack).
    * If the remote racks cannot supply them all, the missing ones are
    * taken from the local rack.
    *
    * @throws NotEnoughReplicasException if the local rack cannot make up
    *         the shortfall either
    */
   private void chooseRemoteRack(int numOfReplicas,
                                 DatanodeDescriptor localMachine,
                                 List<Node> excludedNodes,
                                 long blocksize,
                                 int maxReplicasPerRack,
                                 List<DatanodeDescriptor> results)
     throws NotEnoughReplicasException {
     final int sizeBefore = results.size();
     try {
       chooseRandom(numOfReplicas, "~"+localMachine.getNetworkLocation(),
                    excludedNodes, blocksize, maxReplicasPerRack, results);
     } catch (NotEnoughReplicasException remoteRacksExhausted) {
       // some remote nodes may have been placed before the failure
       final int placedRemotely = results.size() - sizeBefore;
       final int stillNeeded = numOfReplicas - placedRemotely;
       chooseRandom(stillNeeded, localMachine.getNetworkLocation(),
                    excludedNodes, blocksize, maxReplicasPerRack, results);
     }
   }

   /**
    * Draws random nodes from the scope <i>nodes</i> until one passes the
    * target check, then appends it to <i>results</i>. Every node drawn,
    * accepted or not, ends up in <i>excludedNodes</i>.
    *
    * @return the chosen node
    * @throws NotEnoughReplicasException if the scope has no node left to draw
    */
   private DatanodeDescriptor chooseRandom(
                                           String nodes,
                                           List<Node> excludedNodes,
                                           long blocksize,
                                           int maxNodesPerRack,
                                           List<DatanodeDescriptor> results)
     throws NotEnoughReplicasException {
     while (true) {
       final DatanodeDescriptor[] drawn = chooseRandom(1, nodes, excludedNodes);
       if (drawn.length == 0) {
         throw new NotEnoughReplicasException(
                                              "Not able to place enough replicas");
       }
       final DatanodeDescriptor candidate = drawn[0];
       if (isGoodTarget(candidate, blocksize, maxNodesPerRack, results)) {
         results.add(candidate);
         return candidate;
       }
     }
   }

   /**
    * Appends <i>numOfReplicas</i> randomly drawn, acceptable nodes from the
    * scope <i>nodes</i> to <i>results</i>. Nodes are drawn in batches of
    * the size still outstanding; drawing stops once enough nodes were
    * accepted or a batch comes back smaller than requested.
    *
    * @throws NotEnoughReplicasException if fewer than <i>numOfReplicas</i>
    *         nodes were accepted; those accepted stay in <i>results</i>
    */
   private void chooseRandom(int numOfReplicas,
                             String nodes,
                             List<Node> excludedNodes,
                             long blocksize,
                             int maxNodesPerRack,
                             List<DatanodeDescriptor> results)
     throws NotEnoughReplicasException {
     int outstanding = numOfReplicas;
     boolean scopeExhausted;
     do {
       final DatanodeDescriptor[] batch =
         chooseRandom(outstanding, nodes, excludedNodes);
       // a short batch means the scope has nothing more to offer
       scopeExhausted = batch.length < outstanding;
       for (DatanodeDescriptor candidate : batch) {
         if (isGoodTarget(candidate, blocksize, maxNodesPerRack, results)) {
           results.add(candidate);
           outstanding--;
         }
       }
     } while (outstanding > 0 && !scopeExhausted);

     if (outstanding > 0) {
       throw new NotEnoughReplicasException(
                                            "Not able to place enough replicas");
     }
   }

   /**
    * Draws up to <i>numOfReplicas</i> distinct random nodes from the scope
    * <i>nodes</i>, skipping anything in <i>excludedNodes</i>. The count is
    * capped at the number of nodes the topology reports as available.
    * Each node drawn is added to <i>excludedNodes</i>.
    *
    * @return the drawn nodes, possibly fewer than requested
    */
   private DatanodeDescriptor[] chooseRandom(int numOfReplicas,
                                             String nodes,
                                             List<Node> excludedNodes) {
     final int available =
       clusterMap.countNumOfAvailableNodes(nodes, excludedNodes);
     int stillToDraw = Math.min(available, numOfReplicas);

     final List<DatanodeDescriptor> drawn =
       new ArrayList<DatanodeDescriptor>();
     while (stillToDraw > 0) {
       final DatanodeDescriptor candidate =
         (DatanodeDescriptor) clusterMap.chooseRandom(nodes);
       if (excludedNodes.contains(candidate)) {
         continue; // ruled out or drawn before: try again
       }
       drawn.add(candidate);
       excludedNodes.add(candidate);
       stillToDraw--;
     }
     return drawn.toArray(new DatanodeDescriptor[drawn.size()]);
   }

   /**
    * Judges whether <i>node</i> is a good target, applying the load
    * criterion according to this chooser's <i>considerLoad</i> setting.
    *
    * @return true if <i>node</i> passes all checks of the five-argument
    *         overload
    */
   private boolean isGoodTarget(DatanodeDescriptor node,
                                long blockSize, int maxTargetPerLoc,
                                List<DatanodeDescriptor> results) {
     final boolean checkLoad = this.considerLoad;
     return isGoodTarget(node, blockSize, maxTargetPerLoc, checkLoad, results);
   }

   /**
    * Judges whether <i>node</i> is a good target. A node is rejected when
    * <ul>
    * <li>it is decommissioned or being decommissioned;</li>
    * <li>its remaining space, less the space of blocks already scheduled
    *     to it, is below FSConstants.MIN_BLOCKS_FOR_WRITE blocks;</li>
    * <li><i>considerLoad</i> is set and its xceiver count exceeds twice
    *     the average load per node;</li>
    * <li>accepting it would put more than <i>maxTargetPerLoc</i> chosen
    *     nodes on its rack.</li>
    * </ul>
    * The reason for a rejection is logged at debug level.
    *
    * @param node candidate datanode.
    * @param blockSize size of the block to be written.
    * @param maxTargetPerLoc maximum number of chosen nodes per rack.
    * @param considerLoad whether to apply the load criterion.
    * @param results nodes chosen so far.
    * @return true if the node passes every check.
    */
   private boolean isGoodTarget(DatanodeDescriptor node,
                                long blockSize, int maxTargetPerLoc,
                                boolean considerLoad,
                                List<DatanodeDescriptor> results) {
     Log logr = FSNamesystem.LOG;
     // This runs for every candidate; build the rejection messages only
     // when debug output is actually enabled.
     final boolean debug = logr.isDebugEnabled();

     // check if the node is (being) decommissed
     if (node.isDecommissionInProgress() || node.isDecommissioned()) {
       if (debug) {
         logr.debug("Node "+NodeBase.getPath(node)+
                   " is not chosen because the node is (being) decommissioned");
       }
       return false;
     }

     long remaining = node.getRemaining() -
                      (node.getBlocksScheduled() * blockSize);
     // check the remaining capacity of the target machine
     if (blockSize* FSConstants.MIN_BLOCKS_FOR_WRITE>remaining) {
       if (debug) {
         logr.debug("Node "+NodeBase.getPath(node)+
                   " is not chosen because the node does not have enough space");
       }
       return false;
     }

     // check the communication traffic of the target machine
     if (considerLoad) {
       double avgLoad = 0;
       int size = clusterMap.getNumOfLeaves();
       if (size != 0) {
         avgLoad = (double)fs.getTotalLoad()/size;
       }
       if (node.getXceiverCount() > (2.0 * avgLoad)) {
         if (debug) {
           logr.debug("Node "+NodeBase.getPath(node)+
                     " is not chosen because the node is too busy");
         }
         return false;
       }
     }

     // check if the target rack has chosen too many nodes;
     // the count starts at 1 to include the candidate itself
     String rackname = node.getNetworkLocation();
     int counter=1;
     for (DatanodeDescriptor chosen : results) {
       if (rackname.equals(chosen.getNetworkLocation())) {
         counter++;
       }
     }
     if (counter>maxTargetPerLoc) {
       if (debug) {
         logr.debug("Node "+NodeBase.getPath(node)+
                   " is not chosen because the rack has too many chosen nodes");
       }
       return false;
     }
     return true;
   }

   /**
    * Orders <i>nodes</i> in place into a pipeline and returns the same array.
    * Starting from the writer, the closest remaining node (by topology
    * distance) is repeatedly moved to the next slot and becomes the
    * reference point for the following step. This is a greedy answer to
    * what is basically a traveling salesman problem. When the writer is
    * null or not in the topology, the first node serves as starting point.
    * The ordering runs while holding the monitor of the topology.
    */
   private DatanodeDescriptor[] getPipeline(
                                            DatanodeDescriptor writer,
                                            DatanodeDescriptor[] nodes) {
     if (nodes.length==0) {
       return nodes;
     }

     synchronized(clusterMap) {
       DatanodeDescriptor from = writer;
       if (from == null || !clusterMap.contains(from)) {
         from = nodes[0];
       }
       for (int slot = 0; slot < nodes.length; slot++) {
         // find the node nearest to 'from' among nodes[slot..]
         int nearestIndex = slot;
         int nearestDistance = clusterMap.getDistance(from, nodes[slot]);
         for (int probe = slot + 1; probe < nodes.length; probe++) {
           final int distance = clusterMap.getDistance(from, nodes[probe]);
           if (distance < nearestDistance) {
             nearestDistance = distance;
             nearestIndex = probe;
           }
         }
         final DatanodeDescriptor nearest = nodes[nearestIndex];
         if (nearestIndex != slot) {
           // swap it into the current slot
           nodes[nearestIndex] = nodes[slot];
           nodes[slot] = nearest;
         }
         from = nearest;
       }
     }
     return nodes;
   }

   /**
    * Verify that the block is replicated on at least 2 different racks
    * (or on <i>replication</i> racks if that is smaller)
    * if there is more than one rack in the system.
    *
    * @param lBlk block with locations
    * @param replication replication factor of the block
    * @param cluster topology of the cluster
    * @return the number of additional racks the block must be replicated
    * on, or 0 if the number of racks is sufficient.
    */
   public static int verifyBlockPlacement(LocatedBlock lBlk,
                                          short replication,
                                          NetworkTopology cluster) {
     final int racksWanted = Math.min(2, replication);
     final int missingRacks = verifyBlockPlacement(lBlk, racksWanted, cluster);
     // a surplus of racks is reported as "nothing missing"
     return Math.max(0, missingRacks);
   }

   /**
    * Verify that the block is replicated on at least minRacks different
    * racks. The requirement is capped at the number of racks in the
    * system, and a system with at most one rack always passes.
    *
    * @param lBlk block with locations
    * @param minRacks number of racks the block should be replicated to
    * @param cluster topology of the cluster
    * @return the difference between the required and the actual number of
    * racks the block is replicated to; negative when the block is on more
    * racks than required.
    */
   public static int verifyBlockPlacement(LocatedBlock lBlk,
                                          int minRacks,
                                          NetworkTopology cluster) {
     final DatanodeInfo[] replicaLocations = lBlk.getLocations();
     final int racksInCluster = cluster.getNumOfRacks();
     if (racksInCluster <= 1) {
       // only one rack
       return 0;
     }
     final int racksRequired = Math.min(minRacks, racksInCluster);

     // collect the distinct racks that hold a replica
     final Set<String> occupiedRacks = new TreeSet<String>();
     if (replicaLocations != null) {
       for (int i = 0; i < replicaLocations.length; i++) {
         occupiedRacks.add(replicaLocations[i].getNetworkLocation());
       }
     }
     return racksRequired - occupiedRacks.size();
   }
 } //end of Replicator
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.hadoop.hdfs.server.namenode;

	import org.apache.commons.logging.*;
	import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
	import org.apache.hadoop.hdfs.protocol.FSConstants;
	import org.apache.hadoop.hdfs.protocol.LocatedBlock;
	import org.apache.hadoop.net.NetworkTopology;
	import org.apache.hadoop.net.Node;
	import org.apache.hadoop.net.NodeBase;
	import java.util.*;

	/** The class is responsible for choosing the desired number of targets
	* for placing block replicas.
	* The replica placement strategy is that if the writer is on a datanode,
	* the 1st replica is placed on the local machine,
	* otherwise a random datanode. The 2nd replica is placed on a datanode
	* that is on a different rack. The 3rd replica is placed on a datanode
	* which is on the same rack as the first replica.
	*/
	class ReplicationTargetChooser {
	/** Whether a candidate's load is taken into account when judging targets. */
	private final boolean considerLoad;
	/** Topology from which targets are drawn. */
	private NetworkTopology clusterMap;
	/** Namesystem this chooser belongs to. */
	private FSNamesystem fs;

	/**
	 * Creates a chooser bound to one namesystem and one network topology.
	 *
	 * @param considerLoad true if node load should be part of the target check.
	 * @param fs the namesystem.
	 * @param clusterMap topology describing the datanodes and their racks.
	 */
	ReplicationTargetChooser(boolean considerLoad, FSNamesystem fs,
	                         NetworkTopology clusterMap) {
	  this.clusterMap = clusterMap;
	  this.fs = fs;
	  this.considerLoad = considerLoad;
	}

	/**
	 * Thrown by the selection helpers of this class when fewer suitable
	 * datanodes are available than were asked for.
	 */
	private static class NotEnoughReplicasException extends Exception {
	  /** Exception is Serializable, so the stream version is pinned explicitly. */
	  private static final long serialVersionUID = 1L;

	  /**
	   * @param msg description of the shortfall.
	   */
	  NotEnoughReplicasException(String msg) {
	    super(msg);
	  }
	}

	/**
	 * Chooses up to <i>numOfReplicas</i> datanodes on which <i>writer</i>
	 * can place a block of size <i>blocksize</i>, starting with no
	 * previously chosen nodes. The work is done by the five-argument
	 * overload, which is handed an empty list of chosen nodes.
	 *
	 * @param numOfReplicas number of replicas wanted.
	 * @param writer the writer's machine, null if not in the cluster.
	 * @param excludedNodes datanodes that should not be considered targets;
	 *        null is treated as an empty list.
	 * @param blocksize size of the data to be written.
	 * @return array of DatanodeDescriptor instances chosen as targets
	 * and sorted as a pipeline.
	 */
	DatanodeDescriptor[] chooseTarget(int numOfReplicas,
	                                  DatanodeDescriptor writer,
	                                  List<Node> excludedNodes,
	                                  long blocksize) {
	  final List<Node> excluded =
	    (excludedNodes != null) ? excludedNodes : new ArrayList<Node>();
	  final List<DatanodeDescriptor> nothingChosenYet =
	    new ArrayList<DatanodeDescriptor>();

	  return chooseTarget(numOfReplicas, writer, nothingChosenYet,
	                      excluded, blocksize);
	}

	/**
	* choose <i>numOfReplicas</i> data nodes for <i>writer</i>
	* to re-replicate a block with size <i>blocksize</i>
	* If not, return as many as we can.
	*
	* @param numOfReplicas: additional number of replicas wanted.
	* @param writer: the writer's machine, null if not in the cluster.
	* @param choosenNodes: datanodes that have been choosen as targets.
	* @param excludedNodes: datanodesthat should not be considered targets.
	* @param blocksize: size of the data to be written.
	* @return array of DatanodeDescriptor instances chosen as target
	* and sorted as a pipeline.
	*/
	DatanodeDescriptor[] chooseTarget(int numOfReplicas,
	DatanodeDescriptor writer,
	List<DatanodeDescriptor> choosenNodes,
	List<Node> excludedNodes,
	long blocksize) {
	if (numOfReplicas == 0 \|\| clusterMap.getNumOfLeaves()==0) {
	return new DatanodeDescriptor[0];
	}

	if (excludedNodes == null) {
	excludedNodes = new ArrayList<Node>();
	}

	int clusterSize = clusterMap.getNumOfLeaves();
	int totalNumOfReplicas = choosenNodes.size()+numOfReplicas;
	if (totalNumOfReplicas > clusterSize) {
	numOfReplicas -= (totalNumOfReplicas-clusterSize);
	totalNumOfReplicas = clusterSize;
	}

	int maxNodesPerRack =
	(totalNumOfReplicas-1)/clusterMap.getNumOfRacks()+2;

	List<DatanodeDescriptor> results =
	new ArrayList<DatanodeDescriptor>(choosenNodes);
	excludedNodes.addAll(choosenNodes);

	if (!clusterMap.contains(writer)) {
	writer=null;
	}

	DatanodeDescriptor localNode = chooseTarget(numOfReplicas, writer,
	excludedNodes, blocksize, maxNodesPerRack, results);

	results.removeAll(choosenNodes);

	// sorting nodes to form a pipeline
	return getPipeline((writer==null)?localNode:writer,
	results.toArray(new DatanodeDescriptor[results.size()]));
	}

	/* choose <i>numOfReplicas</i> from all data nodes */
	private DatanodeDescriptor chooseTarget(int numOfReplicas,
	DatanodeDescriptor writer,
	List<Node> excludedNodes,
	long blocksize,
	int maxNodesPerRack,
	List<DatanodeDescriptor> results) {

	if (numOfReplicas == 0 \|\| clusterMap.getNumOfLeaves()==0) {
	return writer;
	}

	int numOfResults = results.size();
	boolean newBlock = (numOfResults==0);
	if (writer == null && !newBlock) {
	writer = (DatanodeDescriptor)results.get(0);
	}

	try {
	switch(numOfResults) {
	case 0:
	writer = chooseLocalNode(writer, excludedNodes,
	blocksize, maxNodesPerRack, results);
	if (--numOfReplicas == 0) {
	break;
	}
	case 1:
	chooseRemoteRack(1, results.get(0), excludedNodes,
	blocksize, maxNodesPerRack, results);
	if (--numOfReplicas == 0) {
	break;
	}
	case 2:
	if (clusterMap.isOnSameRack(results.get(0), results.get(1))) {
	chooseRemoteRack(1, results.get(0), excludedNodes,
	blocksize, maxNodesPerRack, results);
	} else if (newBlock){
	chooseLocalRack(results.get(1), excludedNodes, blocksize,
	maxNodesPerRack, results);
	} else {
	chooseLocalRack(writer, excludedNodes, blocksize,
	maxNodesPerRack, results);
	}
	if (--numOfReplicas == 0) {
	break;
	}
	default:
	chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes,
	blocksize, maxNodesPerRack, results);
	}
	} catch (NotEnoughReplicasException e) {
	FSNamesystem.LOG.warn("Not able to place enough replicas, still in need of "
	+ numOfReplicas);
	}
	return writer;
	}

	/**
	 * Picks <i>localMachine</i> itself as a target when that is possible.
	 * Without a local machine a random node of the cluster is picked.
	 * If the local machine is already excluded or fails the target check
	 * (made here without the load criterion), a node on its rack is tried
	 * through chooseLocalRack. A local machine that was not yet excluded is
	 * added to <i>excludedNodes</i> whether or not it is accepted.
	 *
	 * @return the chosen node, which has been appended to <i>results</i>
	 * @throws NotEnoughReplicasException if the fallbacks find no node
	 */
	private DatanodeDescriptor chooseLocalNode(
	    DatanodeDescriptor localMachine,
	    List<Node> excludedNodes,
	    long blocksize,
	    int maxNodesPerRack,
	    List<DatanodeDescriptor> results)
	  throws NotEnoughReplicasException {
	  if (localMachine == null) {
	    // no writer node to prefer: any node will do
	    return chooseRandom(NodeBase.ROOT, excludedNodes,
	                        blocksize, maxNodesPerRack, results);
	  }

	  final boolean alreadyExcluded = excludedNodes.contains(localMachine);
	  if (!alreadyExcluded) {
	    excludedNodes.add(localMachine);
	    final boolean usable = isGoodTarget(localMachine, blocksize,
	                                        maxNodesPerRack, false, results);
	    if (usable) {
	      results.add(localMachine);
	      return localMachine;
	    }
	  }

	  // the machine itself is unavailable: stay on its rack if we can
	  return chooseLocalRack(localMachine, excludedNodes,
	                         blocksize, maxNodesPerRack, results);
	}

	/**
	 * Picks one node from the rack of <i>localMachine</i>.
	 * If that rack yields nothing, the rack of the first entry of
	 * <i>results</i> that is not <i>localMachine</i> is tried; if that
	 * fails too, or no such entry exists, a random node of the whole
	 * cluster is picked. Without a local machine the random pick is made
	 * straight away.
	 *
	 * @return the chosen node, which has been appended to <i>results</i>
	 * @throws NotEnoughReplicasException if even the cluster-wide pick fails
	 */
	private DatanodeDescriptor chooseLocalRack(
	    DatanodeDescriptor localMachine,
	    List<Node> excludedNodes,
	    long blocksize,
	    int maxNodesPerRack,
	    List<DatanodeDescriptor> results)
	  throws NotEnoughReplicasException {
	  if (localMachine == null) {
	    return chooseRandom(NodeBase.ROOT, excludedNodes,
	                        blocksize, maxNodesPerRack, results);
	  }

	  try {
	    // first preference: the local rack
	    return chooseRandom(localMachine.getNetworkLocation(),
	                        excludedNodes, blocksize, maxNodesPerRack, results);
	  } catch (NotEnoughReplicasException localRackExhausted) {
	    // second preference: the rack of another already chosen replica
	    DatanodeDescriptor otherReplica = null;
	    for (DatanodeDescriptor chosen : results) {
	      if (chosen != localMachine) {
	        otherReplica = chosen;
	        break;
	      }
	    }

	    if (otherReplica != null) {
	      try {
	        return chooseRandom(otherReplica.getNetworkLocation(),
	                            excludedNodes, blocksize, maxNodesPerRack,
	                            results);
	      } catch (NotEnoughReplicasException otherRackExhausted) {
	        // continue with the cluster-wide pick below
	      }
	    }

	    // last resort: anywhere in the network
	    return chooseRandom(NodeBase.ROOT, excludedNodes,
	                        blocksize, maxNodesPerRack, results);
	  }
	}

	/**
	 * Picks <i>numOfReplicas</i> nodes from racks other than the one
	 * <i>localMachine</i> is on (scope "~" followed by its rack).
	 * If the remote racks cannot supply them all, the missing ones are
	 * taken from the local rack.
	 *
	 * @throws NotEnoughReplicasException if the local rack cannot make up
	 *         the shortfall either
	 */
	private void chooseRemoteRack(int numOfReplicas,
	                              DatanodeDescriptor localMachine,
	                              List<Node> excludedNodes,
	                              long blocksize,
	                              int maxReplicasPerRack,
	                              List<DatanodeDescriptor> results)
	  throws NotEnoughReplicasException {
	  final int sizeBefore = results.size();
	  try {
	    chooseRandom(numOfReplicas, "~"+localMachine.getNetworkLocation(),
	                 excludedNodes, blocksize, maxReplicasPerRack, results);
	  } catch (NotEnoughReplicasException remoteRacksExhausted) {
	    // some remote nodes may have been placed before the failure
	    final int placedRemotely = results.size() - sizeBefore;
	    final int stillNeeded = numOfReplicas - placedRemotely;
	    chooseRandom(stillNeeded, localMachine.getNetworkLocation(),
	                 excludedNodes, blocksize, maxReplicasPerRack, results);
	  }
	}

	/**
	 * Draws random nodes from the scope <i>nodes</i> until one passes the
	 * target check, then appends it to <i>results</i>. Every node drawn,
	 * accepted or not, ends up in <i>excludedNodes</i>.
	 *
	 * @return the chosen node
	 * @throws NotEnoughReplicasException if the scope has no node left to draw
	 */
	private DatanodeDescriptor chooseRandom(
	    String nodes,
	    List<Node> excludedNodes,
	    long blocksize,
	    int maxNodesPerRack,
	    List<DatanodeDescriptor> results)
	  throws NotEnoughReplicasException {
	  while (true) {
	    final DatanodeDescriptor[] drawn = chooseRandom(1, nodes, excludedNodes);
	    if (drawn.length == 0) {
	      throw new NotEnoughReplicasException(
	          "Not able to place enough replicas");
	    }
	    final DatanodeDescriptor candidate = drawn[0];
	    if (isGoodTarget(candidate, blocksize, maxNodesPerRack, results)) {
	      results.add(candidate);
	      return candidate;
	    }
	  }
	}

	/**
	 * Appends <i>numOfReplicas</i> randomly drawn, acceptable nodes from the
	 * scope <i>nodes</i> to <i>results</i>. Nodes are drawn in batches of
	 * the size still outstanding; drawing stops once enough nodes were
	 * accepted or a batch comes back smaller than requested.
	 *
	 * @throws NotEnoughReplicasException if fewer than <i>numOfReplicas</i>
	 *         nodes were accepted; those accepted stay in <i>results</i>
	 */
	private void chooseRandom(int numOfReplicas,
	                          String nodes,
	                          List<Node> excludedNodes,
	                          long blocksize,
	                          int maxNodesPerRack,
	                          List<DatanodeDescriptor> results)
	  throws NotEnoughReplicasException {
	  int outstanding = numOfReplicas;
	  boolean scopeExhausted;
	  do {
	    final DatanodeDescriptor[] batch =
	      chooseRandom(outstanding, nodes, excludedNodes);
	    // a short batch means the scope has nothing more to offer
	    scopeExhausted = batch.length < outstanding;
	    for (DatanodeDescriptor candidate : batch) {
	      if (isGoodTarget(candidate, blocksize, maxNodesPerRack, results)) {
	        results.add(candidate);
	        outstanding--;
	      }
	    }
	  } while (outstanding > 0 && !scopeExhausted);

	  if (outstanding > 0) {
	    throw new NotEnoughReplicasException(
	        "Not able to place enough replicas");
	  }
	}

	/**
	 * Draws up to <i>numOfReplicas</i> distinct random nodes from the scope
	 * <i>nodes</i>, skipping anything in <i>excludedNodes</i>. The count is
	 * capped at the number of nodes the topology reports as available.
	 * Each node drawn is added to <i>excludedNodes</i>.
	 *
	 * @return the drawn nodes, possibly fewer than requested
	 */
	private DatanodeDescriptor[] chooseRandom(int numOfReplicas,
	                                          String nodes,
	                                          List<Node> excludedNodes) {
	  final int available =
	    clusterMap.countNumOfAvailableNodes(nodes, excludedNodes);
	  int stillToDraw = Math.min(available, numOfReplicas);

	  final List<DatanodeDescriptor> drawn =
	    new ArrayList<DatanodeDescriptor>();
	  while (stillToDraw > 0) {
	    final DatanodeDescriptor candidate =
	      (DatanodeDescriptor) clusterMap.chooseRandom(nodes);
	    if (excludedNodes.contains(candidate)) {
	      continue; // ruled out or drawn before: try again
	    }
	    drawn.add(candidate);
	    excludedNodes.add(candidate);
	    stillToDraw--;
	  }
	  return drawn.toArray(new DatanodeDescriptor[drawn.size()]);
	}

	/* Decide whether <i>node</i> is an acceptable replica target, applying
	 * the load check according to this chooser's considerLoad setting.
	 * Delegates to the five-argument overload.
	 * @return true if the node is accepted
	 */
	private boolean isGoodTarget(DatanodeDescriptor node,
			long blockSize, int maxTargetPerLoc,
			List<DatanodeDescriptor> results) {
		final boolean checkLoad = this.considerLoad;
		return isGoodTarget(node, blockSize, maxTargetPerLoc, checkLoad, results);
	}

	private boolean isGoodTarget(DatanodeDescriptor node,
	long blockSize, int maxTargetPerLoc,
	boolean considerLoad,
	List<DatanodeDescriptor> results) {
	Log logr = FSNamesystem.LOG;
	// check if the node is (being) decommissed
	if (node.isDecommissionInProgress() \|\| node.isDecommissioned()) {
	logr.debug("Node "+NodeBase.getPath(node)+
	" is not chosen because the node is (being) decommissioned");
	return false;
	}

	long remaining = node.getRemaining() -
	(node.getBlocksScheduled() * blockSize);
	// check the remaining capacity of the target machine
	if (blockSize* FSConstants.MIN_BLOCKS_FOR_WRITE>remaining) {
	logr.debug("Node "+NodeBase.getPath(node)+
	" is not chosen because the node does not have enough space");
	return false;
	}

	// check the communication traffic of the target machine
	if (considerLoad) {
	double avgLoad = 0;
	int size = clusterMap.getNumOfLeaves();
	if (size != 0) {
	avgLoad = (double)fs.getTotalLoad()/size;
	}
	if (node.getXceiverCount() > (2.0 * avgLoad)) {
	logr.debug("Node "+NodeBase.getPath(node)+
	" is not chosen because the node is too busy");
	return false;
	}
	}

	// check if the target rack has chosen too many nodes
	String rackname = node.getNetworkLocation();
	int counter=1;
	for(Iterator<DatanodeDescriptor> iter = results.iterator();
	iter.hasNext();) {
	Node result = iter.next();
	if (rackname.equals(result.getNetworkLocation())) {
	counter++;
	}
	}
	if (counter>maxTargetPerLoc) {
	logr.debug("Node "+NodeBase.getPath(node)+
	" is not chosen because the rack has too many chosen nodes");
	return false;
	}
	return true;
	}

	/* Return a pipeline of nodes.
	* The pipeline is formed finding a shortest path that
	* starts from the writer and tranverses all <i>nodes</i>
	* This is basically a traveling salesman problem.
	*/
	private DatanodeDescriptor[] getPipeline(
	DatanodeDescriptor writer,
	DatanodeDescriptor[] nodes) {
	if (nodes.length==0) return nodes;

	synchronized(clusterMap) {
	int index=0;
	if (writer == null \|\| !clusterMap.contains(writer)) {
	writer = nodes[0];
	}
	for(;index<nodes.length; index++) {
	DatanodeDescriptor shortestNode = nodes[index];
	int shortestDistance = clusterMap.getDistance(writer, shortestNode);
	int shortestIndex = index;
	for(int i=index+1; i<nodes.length; i++) {
	DatanodeDescriptor currentNode = nodes[i];
	int currentDistance = clusterMap.getDistance(writer, currentNode);
	if (shortestDistance>currentDistance) {
	shortestDistance = currentDistance;
	shortestNode = currentNode;
	shortestIndex = i;
	}
	}
	//switch position index & shortestIndex
	if (index != shortestIndex) {
	nodes[shortestIndex] = nodes[index];
	nodes[index] = shortestNode;
	}
	writer = shortestNode;
	}
	}
	return nodes;
	}

	/**
	 * Verify that the block is replicated on at least 2 different racks
	 * (or on <code>replication</code> racks when that is smaller),
	 * if there is more than one rack in the system.
	 *
	 * @param lBlk block with locations
	 * @param replication replication factor of the block
	 * @param cluster network topology used to count the racks
	 * @return the number of additional racks the block must be replicated
	 * to, or 0 if the number of racks is sufficient.
	 */
	public static int verifyBlockPlacement(LocatedBlock lBlk,
			short replication,
			NetworkTopology cluster) {
		final int requiredRacks = Math.min(2, replication);
		final int missingRacks = verifyBlockPlacement(lBlk, requiredRacks, cluster);
		// a negative difference means more racks than required: report 0
		return Math.max(missingRacks, 0);
	}

	/**
	 * Compare the number of distinct racks holding the block with the
	 * required number of racks. The requirement is capped at the number of
	 * racks in the cluster; a cluster with at most one rack always yields 0.
	 *
	 * @param lBlk block with locations
	 * @param minRacks number of racks the block should be replicated to
	 * @param cluster network topology used to count the racks
	 * @return the difference between the required and the actual number of
	 * racks the block is replicated to (negative when the block is on more
	 * racks than required).
	 */
	public static int verifyBlockPlacement(LocatedBlock lBlk,
			int minRacks,
			NetworkTopology cluster) {
		DatanodeInfo[] locations = lBlk.getLocations();
		if (locations == null) {
			locations = new DatanodeInfo[0];
		}
		final int clusterRacks = cluster.getNumOfRacks();
		if (clusterRacks <= 1) {
			// only one rack
			return 0;
		}
		final int requiredRacks = Math.min(minRacks, clusterRacks);
		// Collect the rack of every location; the set keeps each rack once.
		Set<String> usedRacks = new TreeSet<String>();
		for (int i = 0; i < locations.length; i++) {
			usedRacks.add(locations[i].getNetworkLocation());
		}
		return requiredRacks - usedRacks.size();
	}
	} //end of Replicator