mapreduce/src/java/org/apache/hadoop/mapreduce/lib/input/CombineFileInputFormat.java - hadoop - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.hadoop.mapreduce.lib.input;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.HashMap;
 import java.util.Set;
 import java.util.Iterator;
 import java.util.Map;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.net.NodeBase;
 import org.apache.hadoop.net.NetworkTopology;

 /**
  * An abstract {@link InputFormat} that returns {@link CombineFileSplit}'s in
  * {@link InputFormat#getSplits(JobContext)} method.
  *
  * Splits are constructed from the files under the input paths.
  * A split cannot have files from different pools.
  * Each split returned may contain blocks from different files.
  * If a maxSplitSize is specified, then blocks on the same node are
  * combined to form a single split. Blocks that are left over are
  * then combined with other blocks in the same rack.
  * If maxSplitSize is not specified, then blocks from the same rack
  * are combined in a single split; no attempt is made to create
  * node-local splits.
  * If the maxSplitSize is equal to the block size, then this class
  * is similar to the default splitting behavior in Hadoop: each
  * block is a locally processed split.
  * Subclasses implement
  * {@link InputFormat#createRecordReader(InputSplit, TaskAttemptContext)}
  * to construct <code>RecordReader</code>'s for
  * <code>CombineFileSplit</code>'s.
  *
  * @see CombineFileSplit
  */
 public abstract class CombineFileInputFormat<K, V>
   extends FileInputFormat<K, V> {

   public static final String SPLIT_MINSIZE_PERNODE =
     "mapreduce.input.fileinputformat.split.minsize.per.node";
   public static final String SPLIT_MINSIZE_PERRACK =
     "mapreduce.input.fileinputformat.split.minsize.per.rack";
   // ability to limit the size of a single split
   private long maxSplitSize = 0;
   private long minSplitSizeNode = 0;
   private long minSplitSizeRack = 0;

   // A pool of input paths filters. A split cannot have blocks from files
   // across multiple pools.
   private ArrayList<MultiPathFilter> pools = new  ArrayList<MultiPathFilter>();

   // mapping from a rack name to the set of Nodes in the rack
   private static HashMap<String, Set<String>> rackToNodes =
                             new HashMap<String, Set<String>>();
   /**
    * Specify the maximum size (in bytes) of each split. Each split is
    * approximately equal to the specified size.
    */
   protected void setMaxSplitSize(long maxSplitSize) {
     this.maxSplitSize = maxSplitSize;
   }

   /**
    * Specify the minimum size (in bytes) of each split per node.
    * This applies to data that is left over after combining data on a single
    * node into splits that are of maximum size specified by maxSplitSize.
    * This leftover data will be combined into its own split if its size
    * exceeds minSplitSizeNode.
    */
   protected void setMinSplitSizeNode(long minSplitSizeNode) {
     this.minSplitSizeNode = minSplitSizeNode;
   }

   /**
    * Specify the minimum size (in bytes) of each split per rack.
    * This applies to data that is left over after combining data on a single
    * rack into splits that are of maximum size specified by maxSplitSize.
    * This leftover data will be combined into its own split if its size
    * exceeds minSplitSizeRack.
    */
   protected void setMinSplitSizeRack(long minSplitSizeRack) {
     this.minSplitSizeRack = minSplitSizeRack;
   }

   /**
    * Create a new pool and add the filters to it.
    * A split cannot have files from different pools.
    */
   protected void createPool(List<PathFilter> filters) {
     pools.add(new MultiPathFilter(filters));
   }

   /**
    * Create a new pool and add the filters to it.
    * A pathname can satisfy any one of the specified filters.
    * A split cannot have files from different pools.
    */
   protected void createPool(PathFilter... filters) {
     MultiPathFilter multi = new MultiPathFilter();
     for (PathFilter f: filters) {
       multi.add(f);
     }
     pools.add(multi);
   }

   /**
    * default constructor
    */
   public CombineFileInputFormat() {
   }

   @Override
   public List<InputSplit> getSplits(JobContext job)
     throws IOException {

     long minSizeNode = 0;
     long minSizeRack = 0;
     long maxSize = 0;
     Configuration conf = job.getConfiguration();

     // the values specified by setxxxSplitSize() takes precedence over the
     // values that might have been specified in the config
     if (minSplitSizeNode != 0) {
       minSizeNode = minSplitSizeNode;
     } else {
       minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
     }
     if (minSplitSizeRack != 0) {
       minSizeRack = minSplitSizeRack;
     } else {
       minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
     }
     if (maxSplitSize != 0) {
       maxSize = maxSplitSize;
     } else {
       maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
     }
     if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
       throw new IOException("Minimum split size pernode " + minSizeNode +
                             " cannot be larger than maximum split size " +
                             maxSize);
     }
     if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
       throw new IOException("Minimum split size per rack" + minSizeRack +
                             " cannot be larger than maximum split size " +
                             maxSize);
     }
     if (minSizeRack != 0 && minSizeNode > minSizeRack) {
       throw new IOException("Minimum split size per node" + minSizeNode +
                             " cannot be smaller than minimum split " +
                             "size per rack " + minSizeRack);
     }

     // all the files in input set
     Path[] paths = FileUtil.stat2Paths(
                      listStatus(job).toArray(new FileStatus[0]));
     List<InputSplit> splits = new ArrayList<InputSplit>();
     if (paths.length == 0) {
       return splits;
     }

     // In one single iteration, process all the paths in a single pool.
     // Processing one pool at a time ensures that a split contains paths
     // from a single pool only.
     for (MultiPathFilter onepool : pools) {
       ArrayList<Path> myPaths = new ArrayList<Path>();

       // pick one input path. If it matches all the filters in a pool,
       // add it to the output set
       for (int i = 0; i < paths.length; i++) {
         if (paths[i] == null) {  // already processed
           continue;
         }
         Path p = new Path(paths[i].toUri().getPath());
         if (onepool.accept(p)) {
           myPaths.add(paths[i]); // add it to my output set
           paths[i] = null;       // already processed
         }
       }
       // create splits for all files in this pool.
       getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]),
                     maxSize, minSizeNode, minSizeRack, splits);
     }

     // Finally, process all paths that do not belong to any pool.
     ArrayList<Path> myPaths = new ArrayList<Path>();
     for (int i = 0; i < paths.length; i++) {
       if (paths[i] == null) {  // already processed
         continue;
       }
       myPaths.add(paths[i]);
     }
     // create splits for all files that are not in any pool.
     getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]),
                   maxSize, minSizeNode, minSizeRack, splits);

     // free up rackToNodes map
     rackToNodes.clear();
     return splits;
   }

   /**
    * Return all the splits in the specified set of paths
    */
   private void getMoreSplits(Configuration conf, Path[] paths,
                              long maxSize, long minSizeNode, long minSizeRack,
                              List<InputSplit> splits)
     throws IOException {

     // all blocks for all the files in input set
     OneFileInfo[] files;

     // mapping from a rack name to the list of blocks it has
     HashMap<String, List<OneBlockInfo>> rackToBlocks =
                               new HashMap<String, List<OneBlockInfo>>();

     // mapping from a block to the nodes on which it has replicas
     HashMap<OneBlockInfo, String[]> blockToNodes =
                               new HashMap<OneBlockInfo, String[]>();

     // mapping from a node to the list of blocks that it contains
     HashMap<String, List<OneBlockInfo>> nodeToBlocks =
                               new HashMap<String, List<OneBlockInfo>>();

     files = new OneFileInfo[paths.length];
     if (paths.length == 0) {
       return;
     }

     // populate all the blocks for all files
     long totLength = 0;
     for (int i = 0; i < paths.length; i++) {
       files[i] = new OneFileInfo(paths[i], conf,
                                  rackToBlocks, blockToNodes, nodeToBlocks);
       totLength += files[i].getLength();
     }

     ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
     ArrayList<String> nodes = new ArrayList<String>();
     long curSplitSize = 0;

     // process all nodes and create splits that are local
     // to a node.
     for (Iterator<Map.Entry<String,
          List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator();
          iter.hasNext();) {

       Map.Entry<String, List<OneBlockInfo>> one = iter.next();
       nodes.add(one.getKey());
       List<OneBlockInfo> blocksInNode = one.getValue();

       // for each block, copy it into validBlocks. Delete it from
       // blockToNodes so that the same block does not appear in
       // two different splits.
       for (OneBlockInfo oneblock : blocksInNode) {
         if (blockToNodes.containsKey(oneblock)) {
           validBlocks.add(oneblock);
           blockToNodes.remove(oneblock);
           curSplitSize += oneblock.length;

           // if the accumulated split size exceeds the maximum, then
           // create this split.
           if (maxSize != 0 && curSplitSize >= maxSize) {
             // create an input split and add it to the splits array
             addCreatedSplit(splits, nodes, validBlocks);
             curSplitSize = 0;
             validBlocks.clear();
           }
         }
       }
       // if there were any blocks left over and their combined size is
       // larger than minSplitNode, then combine them into one split.
       // Otherwise add them back to the unprocessed pool. It is likely
       // that they will be combined with other blocks from the
       // same rack later on.
       if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
         // create an input split and add it to the splits array
         addCreatedSplit(splits, nodes, validBlocks);
       } else {
         for (OneBlockInfo oneblock : validBlocks) {
           blockToNodes.put(oneblock, oneblock.hosts);
         }
       }
       validBlocks.clear();
       nodes.clear();
       curSplitSize = 0;
     }

     // if blocks in a rack are below the specified minimum size, then keep them
     // in 'overflow'. After the processing of all racks is complete, these
     // overflow blocks will be combined into splits.
     ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
     ArrayList<String> racks = new ArrayList<String>();

     // Process all racks over and over again until there is no more work to do.
     while (blockToNodes.size() > 0) {

       // Create one split for this rack before moving over to the next rack.
       // Come back to this rack after creating a single split for each of the
       // remaining racks.
       // Process one rack location at a time, Combine all possible blocks that
       // reside on this rack as one split. (constrained by minimum and maximum
       // split size).

       // iterate over all racks
       for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter =
            rackToBlocks.entrySet().iterator(); iter.hasNext();) {

         Map.Entry<String, List<OneBlockInfo>> one = iter.next();
         racks.add(one.getKey());
         List<OneBlockInfo> blocks = one.getValue();

         // for each block, copy it into validBlocks. Delete it from
         // blockToNodes so that the same block does not appear in
         // two different splits.
         boolean createdSplit = false;
         for (OneBlockInfo oneblock : blocks) {
           if (blockToNodes.containsKey(oneblock)) {
             validBlocks.add(oneblock);
             blockToNodes.remove(oneblock);
             curSplitSize += oneblock.length;

             // if the accumulated split size exceeds the maximum, then
             // create this split.
             if (maxSize != 0 && curSplitSize >= maxSize) {
               // create an input split and add it to the splits array
               addCreatedSplit(splits, getHosts(racks), validBlocks);
               createdSplit = true;
               break;
             }
           }
         }

         // if we created a split, then just go to the next rack
         if (createdSplit) {
           curSplitSize = 0;
           validBlocks.clear();
           racks.clear();
           continue;
         }

         if (!validBlocks.isEmpty()) {
           if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
             // if there is a minimum size specified, then create a single split
             // otherwise, store these blocks into overflow data structure
             addCreatedSplit(splits, getHosts(racks), validBlocks);
           } else {
             // There were a few blocks in this rack that
         	// remained to be processed. Keep them in 'overflow' block list.
         	// These will be combined later.
             overflowBlocks.addAll(validBlocks);
           }
         }
         curSplitSize = 0;
         validBlocks.clear();
         racks.clear();
       }
     }

     assert blockToNodes.isEmpty();
     assert curSplitSize == 0;
     assert validBlocks.isEmpty();
     assert racks.isEmpty();

     // Process all overflow blocks
     for (OneBlockInfo oneblock : overflowBlocks) {
       validBlocks.add(oneblock);
       curSplitSize += oneblock.length;

       // This might cause an exiting rack location to be re-added,
       // but it should be ok.
       for (int i = 0; i < oneblock.racks.length; i++) {
         racks.add(oneblock.racks[i]);
       }

       // if the accumulated split size exceeds the maximum, then
       // create this split.
       if (maxSize != 0 && curSplitSize >= maxSize) {
         // create an input split and add it to the splits array
         addCreatedSplit(splits, getHosts(racks), validBlocks);
         curSplitSize = 0;
         validBlocks.clear();
         racks.clear();
       }
     }

     // Process any remaining blocks, if any.
     if (!validBlocks.isEmpty()) {
       addCreatedSplit(splits, getHosts(racks), validBlocks);
     }
   }

   /**
    * Create a single split from the list of blocks specified in validBlocks
    * Add this new split into splitList.
    */
   private void addCreatedSplit(List<InputSplit> splitList,
                                List<String> locations,
                                ArrayList<OneBlockInfo> validBlocks) {
     // create an input split
     Path[] fl = new Path[validBlocks.size()];
     long[] offset = new long[validBlocks.size()];
     long[] length = new long[validBlocks.size()];
     for (int i = 0; i < validBlocks.size(); i++) {
       fl[i] = validBlocks.get(i).onepath;
       offset[i] = validBlocks.get(i).offset;
       length[i] = validBlocks.get(i).length;
     }

      // add this split to the list that is returned
     CombineFileSplit thissplit = new CombineFileSplit(fl, offset,
                                    length, locations.toArray(new String[0]));
     splitList.add(thissplit);
   }

   /**
    * This is not implemented yet.
    */
   public abstract RecordReader<K, V> createRecordReader(InputSplit split,
       TaskAttemptContext context) throws IOException;

   /**
    * information about one file from the File System
    */
   private static class OneFileInfo {
     private long fileSize;               // size of the file
     private OneBlockInfo[] blocks;       // all blocks in this file

     OneFileInfo(Path path, Configuration conf,
                 HashMap<String, List<OneBlockInfo>> rackToBlocks,
                 HashMap<OneBlockInfo, String[]> blockToNodes,
                 HashMap<String, List<OneBlockInfo>> nodeToBlocks)
                 throws IOException {
       this.fileSize = 0;

       // get block locations from file system
       FileSystem fs = path.getFileSystem(conf);
       FileStatus stat = fs.getFileStatus(path);
       BlockLocation[] locations = fs.getFileBlockLocations(stat, 0,
                                                            stat.getLen());
       // create a list of all block and their locations
       if (locations == null) {
         blocks = new OneBlockInfo[0];
       } else {
         blocks = new OneBlockInfo[locations.length];
         for (int i = 0; i < locations.length; i++) {

           fileSize += locations[i].getLength();
           OneBlockInfo oneblock =  new OneBlockInfo(path,
                                        locations[i].getOffset(),
                                        locations[i].getLength(),
                                        locations[i].getHosts(),
                                        locations[i].getTopologyPaths());
           blocks[i] = oneblock;

           // add this block to the block --> node locations map
           blockToNodes.put(oneblock, oneblock.hosts);

           // add this block to the rack --> block map
           for (int j = 0; j < oneblock.racks.length; j++) {
             String rack = oneblock.racks[j];
             List<OneBlockInfo> blklist = rackToBlocks.get(rack);
             if (blklist == null) {
               blklist = new ArrayList<OneBlockInfo>();
               rackToBlocks.put(rack, blklist);
             }
             blklist.add(oneblock);
             // Add this host to rackToNodes map
             addHostToRack(oneblock.racks[j], oneblock.hosts[j]);
          }

           // add this block to the node --> block map
           for (int j = 0; j < oneblock.hosts.length; j++) {
             String node = oneblock.hosts[j];
             List<OneBlockInfo> blklist = nodeToBlocks.get(node);
             if (blklist == null) {
               blklist = new ArrayList<OneBlockInfo>();
               nodeToBlocks.put(node, blklist);
             }
             blklist.add(oneblock);
           }
         }
       }
     }

     long getLength() {
       return fileSize;
     }

     OneBlockInfo[] getBlocks() {
       return blocks;
     }
   }

   /**
    * information about one block from the File System
    */
   private static class OneBlockInfo {
     Path onepath;                // name of this file
     long offset;                 // offset in file
     long length;                 // length of this block
     String[] hosts;              // nodes on which this block resides
     String[] racks;              // network topology of hosts

     OneBlockInfo(Path path, long offset, long len,
                  String[] hosts, String[] topologyPaths) {
       this.onepath = path;
       this.offset = offset;
       this.hosts = hosts;
       this.length = len;
       assert (hosts.length == topologyPaths.length ||
               topologyPaths.length == 0);

       // if the file system does not have any rack information, then
       // use dummy rack location.
       if (topologyPaths.length == 0) {
         topologyPaths = new String[hosts.length];
         for (int i = 0; i < topologyPaths.length; i++) {
           topologyPaths[i] = (new NodeBase(hosts[i],
                               NetworkTopology.DEFAULT_RACK)).toString();
         }
       }

       // The topology paths have the host name included as the last
       // component. Strip it.
       this.racks = new String[topologyPaths.length];
       for (int i = 0; i < topologyPaths.length; i++) {
         this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation();
       }
     }
   }

   private static void addHostToRack(String rack, String host) {
     Set<String> hosts = rackToNodes.get(rack);
     if (hosts == null) {
       hosts = new HashSet<String>();
       rackToNodes.put(rack, hosts);
     }
     hosts.add(host);
   }

   private static List<String> getHosts(List<String> racks) {
     List<String> hosts = new ArrayList<String>();
     for (String rack : racks) {
       hosts.addAll(rackToNodes.get(rack));
     }
     return hosts;
   }

   /**
    * Accept a path only if any one of filters given in the
    * constructor do.
    */
   private static class MultiPathFilter implements PathFilter {
     private List<PathFilter> filters;

     public MultiPathFilter() {
       this.filters = new ArrayList<PathFilter>();
     }

     public MultiPathFilter(List<PathFilter> filters) {
       this.filters = filters;
     }

     public void add(PathFilter one) {
       filters.add(one);
     }

     public boolean accept(Path path) {
       for (PathFilter filter : filters) {
         if (filter.accept(path)) {
           return true;
         }
       }
       return false;
     }

     public String toString() {
       StringBuffer buf = new StringBuffer();
       buf.append("[");
       for (PathFilter f: filters) {
         buf.append(f);
         buf.append(",");
       }
       buf.append("]");
       return buf.toString();
     }
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.hadoop.mapreduce.lib.input;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashSet;
	import java.util.List;
	import java.util.HashMap;
	import java.util.Set;
	import java.util.Iterator;
	import java.util.Map;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.FileUtil;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.fs.BlockLocation;
	import org.apache.hadoop.fs.FileStatus;
	import org.apache.hadoop.fs.PathFilter;
	import org.apache.hadoop.mapreduce.InputFormat;
	import org.apache.hadoop.mapreduce.InputSplit;
	import org.apache.hadoop.mapreduce.JobContext;
	import org.apache.hadoop.mapreduce.RecordReader;
	import org.apache.hadoop.mapreduce.TaskAttemptContext;
	import org.apache.hadoop.net.NodeBase;
	import org.apache.hadoop.net.NetworkTopology;

	/**
	* An abstract {@link InputFormat} that returns {@link CombineFileSplit}'s in
	* {@link InputFormat#getSplits(JobContext)} method.
	*
	* Splits are constructed from the files under the input paths.
	* A split cannot have files from different pools.
	* Each split returned may contain blocks from different files.
	* If a maxSplitSize is specified, then blocks on the same node are
	* combined to form a single split. Blocks that are left over are
	* then combined with other blocks in the same rack.
	* If maxSplitSize is not specified, then blocks from the same rack
	* are combined in a single split; no attempt is made to create
	* node-local splits.
	* If the maxSplitSize is equal to the block size, then this class
	* is similar to the default splitting behavior in Hadoop: each
	* block is a locally processed split.
	* Subclasses implement
	* {@link InputFormat#createRecordReader(InputSplit, TaskAttemptContext)}
	* to construct <code>RecordReader</code>'s for
	* <code>CombineFileSplit</code>'s.
	*
	* @see CombineFileSplit
	*/
	public abstract class CombineFileInputFormat<K, V>
	extends FileInputFormat<K, V> {

	public static final String SPLIT_MINSIZE_PERNODE =
	"mapreduce.input.fileinputformat.split.minsize.per.node";
	public static final String SPLIT_MINSIZE_PERRACK =
	"mapreduce.input.fileinputformat.split.minsize.per.rack";
	// ability to limit the size of a single split
	private long maxSplitSize = 0;
	private long minSplitSizeNode = 0;
	private long minSplitSizeRack = 0;

	// A pool of input paths filters. A split cannot have blocks from files
	// across multiple pools.
	private ArrayList<MultiPathFilter> pools = new ArrayList<MultiPathFilter>();

	// mapping from a rack name to the set of Nodes in the rack
	private static HashMap<String, Set<String>> rackToNodes =
	new HashMap<String, Set<String>>();
	/**
	* Specify the maximum size (in bytes) of each split. Each split is
	* approximately equal to the specified size.
	*/
	protected void setMaxSplitSize(long maxSplitSize) {
	this.maxSplitSize = maxSplitSize;
	}

	/**
	* Specify the minimum size (in bytes) of each split per node.
	* This applies to data that is left over after combining data on a single
	* node into splits that are of maximum size specified by maxSplitSize.
	* This leftover data will be combined into its own split if its size
	* exceeds minSplitSizeNode.
	*/
	protected void setMinSplitSizeNode(long minSplitSizeNode) {
	this.minSplitSizeNode = minSplitSizeNode;
	}

	/**
	* Specify the minimum size (in bytes) of each split per rack.
	* This applies to data that is left over after combining data on a single
	* rack into splits that are of maximum size specified by maxSplitSize.
	* This leftover data will be combined into its own split if its size
	* exceeds minSplitSizeRack.
	*/
	protected void setMinSplitSizeRack(long minSplitSizeRack) {
	this.minSplitSizeRack = minSplitSizeRack;
	}

	/**
	* Create a new pool and add the filters to it.
	* A split cannot have files from different pools.
	*/
	protected void createPool(List<PathFilter> filters) {
	pools.add(new MultiPathFilter(filters));
	}

	/**
	* Create a new pool and add the filters to it.
	* A pathname can satisfy any one of the specified filters.
	* A split cannot have files from different pools.
	*/
	protected void createPool(PathFilter... filters) {
	MultiPathFilter multi = new MultiPathFilter();
	for (PathFilter f: filters) {
	multi.add(f);
	}
	pools.add(multi);
	}

	/**
	* default constructor
	*/
	public CombineFileInputFormat() {
	}

	@Override
	public List<InputSplit> getSplits(JobContext job)
	throws IOException {

	long minSizeNode = 0;
	long minSizeRack = 0;
	long maxSize = 0;
	Configuration conf = job.getConfiguration();

	// the values specified by setxxxSplitSize() takes precedence over the
	// values that might have been specified in the config
	if (minSplitSizeNode != 0) {
	minSizeNode = minSplitSizeNode;
	} else {
	minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
	}
	if (minSplitSizeRack != 0) {
	minSizeRack = minSplitSizeRack;
	} else {
	minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
	}
	if (maxSplitSize != 0) {
	maxSize = maxSplitSize;
	} else {
	maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
	}
	if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
	throw new IOException("Minimum split size pernode " + minSizeNode +
	" cannot be larger than maximum split size " +
	maxSize);
	}
	if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
	throw new IOException("Minimum split size per rack" + minSizeRack +
	" cannot be larger than maximum split size " +
	maxSize);
	}
	if (minSizeRack != 0 && minSizeNode > minSizeRack) {
	throw new IOException("Minimum split size per node" + minSizeNode +
	" cannot be smaller than minimum split " +
	"size per rack " + minSizeRack);
	}

	// all the files in input set
	Path[] paths = FileUtil.stat2Paths(
	listStatus(job).toArray(new FileStatus[0]));
	List<InputSplit> splits = new ArrayList<InputSplit>();
	if (paths.length == 0) {
	return splits;
	}

	// In one single iteration, process all the paths in a single pool.
	// Processing one pool at a time ensures that a split contains paths
	// from a single pool only.
	for (MultiPathFilter onepool : pools) {
	ArrayList<Path> myPaths = new ArrayList<Path>();

	// pick one input path. If it matches all the filters in a pool,
	// add it to the output set
	for (int i = 0; i < paths.length; i++) {
	if (paths[i] == null) { // already processed
	continue;
	}
	Path p = new Path(paths[i].toUri().getPath());
	if (onepool.accept(p)) {
	myPaths.add(paths[i]); // add it to my output set
	paths[i] = null; // already processed
	}
	}
	// create splits for all files in this pool.
	getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]),
	maxSize, minSizeNode, minSizeRack, splits);
	}

	// Finally, process all paths that do not belong to any pool.
	ArrayList<Path> myPaths = new ArrayList<Path>();
	for (int i = 0; i < paths.length; i++) {
	if (paths[i] == null) { // already processed
	continue;
	}
	myPaths.add(paths[i]);
	}
	// create splits for all files that are not in any pool.
	getMoreSplits(conf, myPaths.toArray(new Path[myPaths.size()]),
	maxSize, minSizeNode, minSizeRack, splits);

	// free up rackToNodes map
	rackToNodes.clear();
	return splits;
	}

	/**
	* Return all the splits in the specified set of paths
	*/
	private void getMoreSplits(Configuration conf, Path[] paths,
	long maxSize, long minSizeNode, long minSizeRack,
	List<InputSplit> splits)
	throws IOException {

	// all blocks for all the files in input set
	OneFileInfo[] files;

	// mapping from a rack name to the list of blocks it has
	HashMap<String, List<OneBlockInfo>> rackToBlocks =
	new HashMap<String, List<OneBlockInfo>>();

	// mapping from a block to the nodes on which it has replicas
	HashMap<OneBlockInfo, String[]> blockToNodes =
	new HashMap<OneBlockInfo, String[]>();

	// mapping from a node to the list of blocks that it contains
	HashMap<String, List<OneBlockInfo>> nodeToBlocks =
	new HashMap<String, List<OneBlockInfo>>();

	files = new OneFileInfo[paths.length];
	if (paths.length == 0) {
	return;
	}

	// populate all the blocks for all files
	long totLength = 0;
	for (int i = 0; i < paths.length; i++) {
	files[i] = new OneFileInfo(paths[i], conf,
	rackToBlocks, blockToNodes, nodeToBlocks);
	totLength += files[i].getLength();
	}

	ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
	ArrayList<String> nodes = new ArrayList<String>();
	long curSplitSize = 0;

	// process all nodes and create splits that are local
	// to a node.
	for (Iterator<Map.Entry<String,
	List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator();
	iter.hasNext();) {

	Map.Entry<String, List<OneBlockInfo>> one = iter.next();
	nodes.add(one.getKey());
	List<OneBlockInfo> blocksInNode = one.getValue();

	// for each block, copy it into validBlocks. Delete it from
	// blockToNodes so that the same block does not appear in
	// two different splits.
	for (OneBlockInfo oneblock : blocksInNode) {
	if (blockToNodes.containsKey(oneblock)) {
	validBlocks.add(oneblock);
	blockToNodes.remove(oneblock);
	curSplitSize += oneblock.length;

	// if the accumulated split size exceeds the maximum, then
	// create this split.
	if (maxSize != 0 && curSplitSize >= maxSize) {
	// create an input split and add it to the splits array
	addCreatedSplit(splits, nodes, validBlocks);
	curSplitSize = 0;
	validBlocks.clear();
	}
	}
	}
	// if there were any blocks left over and their combined size is
	// larger than minSplitNode, then combine them into one split.
	// Otherwise add them back to the unprocessed pool. It is likely
	// that they will be combined with other blocks from the
	// same rack later on.
	if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
	// create an input split and add it to the splits array
	addCreatedSplit(splits, nodes, validBlocks);
	} else {
	for (OneBlockInfo oneblock : validBlocks) {
	blockToNodes.put(oneblock, oneblock.hosts);
	}
	}
	validBlocks.clear();
	nodes.clear();
	curSplitSize = 0;
	}

	// if blocks in a rack are below the specified minimum size, then keep them
	// in 'overflow'. After the processing of all racks is complete, these
	// overflow blocks will be combined into splits.
	ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
	ArrayList<String> racks = new ArrayList<String>();

	// Process all racks over and over again until there is no more work to do.
	while (blockToNodes.size() > 0) {

	// Create one split for this rack before moving over to the next rack.
	// Come back to this rack after creating a single split for each of the
	// remaining racks.
	// Process one rack location at a time, Combine all possible blocks that
	// reside on this rack as one split. (constrained by minimum and maximum
	// split size).

	// iterate over all racks
	for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter =
	rackToBlocks.entrySet().iterator(); iter.hasNext();) {

	Map.Entry<String, List<OneBlockInfo>> one = iter.next();
	racks.add(one.getKey());
	List<OneBlockInfo> blocks = one.getValue();

	// for each block, copy it into validBlocks. Delete it from
	// blockToNodes so that the same block does not appear in
	// two different splits.
	boolean createdSplit = false;
	for (OneBlockInfo oneblock : blocks) {
	if (blockToNodes.containsKey(oneblock)) {
	validBlocks.add(oneblock);
	blockToNodes.remove(oneblock);
	curSplitSize += oneblock.length;

	// if the accumulated split size exceeds the maximum, then
	// create this split.
	if (maxSize != 0 && curSplitSize >= maxSize) {
	// create an input split and add it to the splits array
	addCreatedSplit(splits, getHosts(racks), validBlocks);
	createdSplit = true;
	break;
	}
	}
	}

	// if we created a split, then just go to the next rack
	if (createdSplit) {
	curSplitSize = 0;
	validBlocks.clear();
	racks.clear();
	continue;
	}

	if (!validBlocks.isEmpty()) {
	if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
	// if there is a minimum size specified, then create a single split
	// otherwise, store these blocks into overflow data structure
	addCreatedSplit(splits, getHosts(racks), validBlocks);
	} else {
	// There were a few blocks in this rack that
	// remained to be processed. Keep them in 'overflow' block list.
	// These will be combined later.
	overflowBlocks.addAll(validBlocks);
	}
	}
	curSplitSize = 0;
	validBlocks.clear();
	racks.clear();
	}
	}

	assert blockToNodes.isEmpty();
	assert curSplitSize == 0;
	assert validBlocks.isEmpty();
	assert racks.isEmpty();

	// Process all overflow blocks
	for (OneBlockInfo oneblock : overflowBlocks) {
	validBlocks.add(oneblock);
	curSplitSize += oneblock.length;

	// This might cause an exiting rack location to be re-added,
	// but it should be ok.
	for (int i = 0; i < oneblock.racks.length; i++) {
	racks.add(oneblock.racks[i]);
	}

	// if the accumulated split size exceeds the maximum, then
	// create this split.
	if (maxSize != 0 && curSplitSize >= maxSize) {
	// create an input split and add it to the splits array
	addCreatedSplit(splits, getHosts(racks), validBlocks);
	curSplitSize = 0;
	validBlocks.clear();
	racks.clear();
	}
	}

	// Process any remaining blocks, if any.
	if (!validBlocks.isEmpty()) {
	addCreatedSplit(splits, getHosts(racks), validBlocks);
	}
	}

	/**
	* Create a single split from the list of blocks specified in validBlocks
	* Add this new split into splitList.
	*/
	private void addCreatedSplit(List<InputSplit> splitList,
	List<String> locations,
	ArrayList<OneBlockInfo> validBlocks) {
	// create an input split
	Path[] fl = new Path[validBlocks.size()];
	long[] offset = new long[validBlocks.size()];
	long[] length = new long[validBlocks.size()];
	for (int i = 0; i < validBlocks.size(); i++) {
	fl[i] = validBlocks.get(i).onepath;
	offset[i] = validBlocks.get(i).offset;
	length[i] = validBlocks.get(i).length;
	}

	// add this split to the list that is returned
	CombineFileSplit thissplit = new CombineFileSplit(fl, offset,
	length, locations.toArray(new String[0]));
	splitList.add(thissplit);
	}

	/**
	* This is not implemented yet.
	*/
	public abstract RecordReader<K, V> createRecordReader(InputSplit split,
	TaskAttemptContext context) throws IOException;

	/**
	* information about one file from the File System
	*/
	private static class OneFileInfo {
	private long fileSize; // size of the file
	private OneBlockInfo[] blocks; // all blocks in this file

	OneFileInfo(Path path, Configuration conf,
	HashMap<String, List<OneBlockInfo>> rackToBlocks,
	HashMap<OneBlockInfo, String[]> blockToNodes,
	HashMap<String, List<OneBlockInfo>> nodeToBlocks)
	throws IOException {
	this.fileSize = 0;

	// get block locations from file system
	FileSystem fs = path.getFileSystem(conf);
	FileStatus stat = fs.getFileStatus(path);
	BlockLocation[] locations = fs.getFileBlockLocations(stat, 0,
	stat.getLen());
	// create a list of all block and their locations
	if (locations == null) {
	blocks = new OneBlockInfo[0];
	} else {
	blocks = new OneBlockInfo[locations.length];
	for (int i = 0; i < locations.length; i++) {

	fileSize += locations[i].getLength();
	OneBlockInfo oneblock = new OneBlockInfo(path,
	locations[i].getOffset(),
	locations[i].getLength(),
	locations[i].getHosts(),
	locations[i].getTopologyPaths());
	blocks[i] = oneblock;

	// add this block to the block --> node locations map
	blockToNodes.put(oneblock, oneblock.hosts);

	// add this block to the rack --> block map
	for (int j = 0; j < oneblock.racks.length; j++) {
	String rack = oneblock.racks[j];
	List<OneBlockInfo> blklist = rackToBlocks.get(rack);
	if (blklist == null) {
	blklist = new ArrayList<OneBlockInfo>();
	rackToBlocks.put(rack, blklist);
	}
	blklist.add(oneblock);
	// Add this host to rackToNodes map
	addHostToRack(oneblock.racks[j], oneblock.hosts[j]);
	}

	// add this block to the node --> block map
	for (int j = 0; j < oneblock.hosts.length; j++) {
	String node = oneblock.hosts[j];
	List<OneBlockInfo> blklist = nodeToBlocks.get(node);
	if (blklist == null) {
	blklist = new ArrayList<OneBlockInfo>();
	nodeToBlocks.put(node, blklist);
	}
	blklist.add(oneblock);
	}
	}
	}
	}

	long getLength() {
	return fileSize;
	}

	OneBlockInfo[] getBlocks() {
	return blocks;
	}
	}

	/**
	* information about one block from the File System
	*/
	private static class OneBlockInfo {
	Path onepath; // name of this file
	long offset; // offset in file
	long length; // length of this block
	String[] hosts; // nodes on which this block resides
	String[] racks; // network topology of hosts

	OneBlockInfo(Path path, long offset, long len,
	String[] hosts, String[] topologyPaths) {
	this.onepath = path;
	this.offset = offset;
	this.hosts = hosts;
	this.length = len;
	assert (hosts.length == topologyPaths.length \|\|
	topologyPaths.length == 0);

	// if the file system does not have any rack information, then
	// use dummy rack location.
	if (topologyPaths.length == 0) {
	topologyPaths = new String[hosts.length];
	for (int i = 0; i < topologyPaths.length; i++) {
	topologyPaths[i] = (new NodeBase(hosts[i],
	NetworkTopology.DEFAULT_RACK)).toString();
	}
	}

	// The topology paths have the host name included as the last
	// component. Strip it.
	this.racks = new String[topologyPaths.length];
	for (int i = 0; i < topologyPaths.length; i++) {
	this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation();
	}
	}
	}

	private static void addHostToRack(String rack, String host) {
	Set<String> hosts = rackToNodes.get(rack);
	if (hosts == null) {
	hosts = new HashSet<String>();
	rackToNodes.put(rack, hosts);
	}
	hosts.add(host);
	}

	private static List<String> getHosts(List<String> racks) {
	List<String> hosts = new ArrayList<String>();
	for (String rack : racks) {
	hosts.addAll(rackToNodes.get(rack));
	}
	return hosts;
	}

	/**
	* Accept a path only if any one of filters given in the
	* constructor do.
	*/
	private static class MultiPathFilter implements PathFilter {
	private List<PathFilter> filters;

	public MultiPathFilter() {
	this.filters = new ArrayList<PathFilter>();
	}

	public MultiPathFilter(List<PathFilter> filters) {
	this.filters = filters;
	}

	public void add(PathFilter one) {
	filters.add(one);
	}

	public boolean accept(Path path) {
	for (PathFilter filter : filters) {
	if (filter.accept(path)) {
	return true;
	}
	}
	return false;
	}

	public String toString() {
	StringBuffer buf = new StringBuffer();
	buf.append("[");
	for (PathFilter f: filters) {
	buf.append(f);
	buf.append(",");
	}
	buf.append("]");
	return buf.toString();
	}
	}
	}