/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| package org.apache.hadoop.mapred; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.hadoop.classification.InterfaceAudience; |
| import org.apache.hadoop.classification.InterfaceStability; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.FileUtil; |
| import org.apache.hadoop.fs.Path; |
| |
| /** |
| * An abstract {@link InputFormat} that returns {@link MultiFileSplit}'s |
| * in {@link #getSplits(JobConf, int)} method. Splits are constructed from |
| * the files under the input paths. Each split returned contains <i>nearly</i> |
| * equal content length. <br> |
| * Subclasses implement {@link #getRecordReader(InputSplit, JobConf, Reporter)} |
| * to construct <code>RecordReader</code>'s for <code>MultiFileSplit</code>'s. |
| * @see MultiFileSplit |
| */ |
| @InterfaceAudience.Public |
| @InterfaceStability.Stable |
| public abstract class MultiFileInputFormat<K, V> |
| extends FileInputFormat<K, V> { |
| |
| @Override |
| public InputSplit[] getSplits(JobConf job, int numSplits) |
| throws IOException { |
| |
| Path[] paths = FileUtil.stat2Paths(listStatus(job)); |
| List<MultiFileSplit> splits = new ArrayList<MultiFileSplit>(Math.min(numSplits, paths.length)); |
| if (paths.length != 0) { |
| // HADOOP-1818: Manage splits only if there are paths |
| long[] lengths = new long[paths.length]; |
| long totLength = 0; |
| for(int i=0; i<paths.length; i++) { |
| FileSystem fs = paths[i].getFileSystem(job); |
| lengths[i] = fs.getContentSummary(paths[i]).getLength(); |
| totLength += lengths[i]; |
| } |
| double avgLengthPerSplit = ((double)totLength) / numSplits; |
| long cumulativeLength = 0; |
| |
| int startIndex = 0; |
| |
| for(int i=0; i<numSplits; i++) { |
| int splitSize = findSize(i, avgLengthPerSplit, cumulativeLength |
| , startIndex, lengths); |
| if (splitSize != 0) { |
| // HADOOP-1818: Manage split only if split size is not equals to 0 |
| Path[] splitPaths = new Path[splitSize]; |
| long[] splitLengths = new long[splitSize]; |
| System.arraycopy(paths, startIndex, splitPaths , 0, splitSize); |
| System.arraycopy(lengths, startIndex, splitLengths , 0, splitSize); |
| splits.add(new MultiFileSplit(job, splitPaths, splitLengths)); |
| startIndex += splitSize; |
| for(long l: splitLengths) { |
| cumulativeLength += l; |
| } |
| } |
| } |
| } |
| return splits.toArray(new MultiFileSplit[splits.size()]); |
| } |
| |
| private int findSize(int splitIndex, double avgLengthPerSplit |
| , long cumulativeLength , int startIndex, long[] lengths) { |
| |
| if(splitIndex == lengths.length - 1) |
| return lengths.length - startIndex; |
| |
| long goalLength = (long)((splitIndex + 1) * avgLengthPerSplit); |
| long partialLength = 0; |
| // accumulate till just above the goal length; |
| for(int i = startIndex; i < lengths.length; i++) { |
| partialLength += lengths[i]; |
| if(partialLength + cumulativeLength >= goalLength) { |
| return i - startIndex + 1; |
| } |
| } |
| return lengths.length - startIndex; |
| } |
| |
| @Override |
| public abstract RecordReader<K, V> getRecordReader(InputSplit split, |
| JobConf job, Reporter reporter) |
| throws IOException; |
| } |