branch-0.23.1/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputFormat.java - hadoop - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.hadoop.tools.mapred.lib;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.mapreduce.*;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 import org.apache.hadoop.tools.DistCpConstants;
 import org.apache.hadoop.tools.util.DistCpUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.FileStatus;

 import java.util.List;
 import java.util.ArrayList;
 import java.io.IOException;

 /**
  * DynamicInputFormat implements the "Worker pattern" for DistCp.
  * Rather than splitting the copy-list into a set of static splits,
  * the DynamicInputFormat does the following:
  * 1. Splits the copy-list into small chunks on the DFS.
  * 2. Creates a set of empty "dynamic" splits, each of which consumes as many
  *    chunks as it can.
  * This arrangement ensures that a single slow mapper won't slow down the entire
  * job (since the slack will be picked up by other mappers, who consume more
  * chunks.)
  * By varying the split-ratio, one can vary chunk sizes to achieve different
  * performance characteristics.
  */
 public class DynamicInputFormat<K, V> extends InputFormat<K, V> {
   private static final Log LOG = LogFactory.getLog(DynamicInputFormat.class);

   private static final String CONF_LABEL_LISTING_SPLIT_RATIO
           = "mapred.listing.split.ratio";
   private static final String CONF_LABEL_NUM_SPLITS
           = "mapred.num.splits";
   private static final String CONF_LABEL_NUM_ENTRIES_PER_CHUNK
           = "mapred.num.entries.per.chunk";

   /** Largest (splitRatio * numMaps) product accepted by validation. */
   private static final int MAX_CHUNKS_TOLERABLE = 400;
   /** Chunk count that getSplitRatio() aims for when nMaps is small. */
   private static final int MAX_CHUNKS_IDEAL     = 100;
   /**
    * Below this many records per chunk, getSplitRatio() falls back to the
    * default ratio. Also used as the nominal (non-zero) FileSplit length.
    */
   private static final int MIN_RECORDS_PER_CHUNK = 5;
   /** Split-ratio returned by getSplitRatio() when no better one applies. */
   private static final int SPLIT_RATIO_DEFAULT  = 2;
   /** Size of the first batch of chunk files that is written concurrently. */
   private static final int N_CHUNKS_OPEN_AT_ONCE_DEFAULT = 16;

   /**
    * Implementation of InputFormat::getSplits(). This method splits up the
    * copy-listing file into chunks, and assigns the first batch to different
    * tasks.
    * @param jobContext JobContext for the map job.
    * @return The list of (empty) dynamic input-splits.
    * @throws IOException on failure.
    * @throws InterruptedException declared by the overridden method.
    */
   @Override
   public List<InputSplit> getSplits(JobContext jobContext)
       throws IOException, InterruptedException {
     LOG.info("DynamicInputFormat: Getting splits for job:"
              + jobContext.getJobID());
     return createSplits(jobContext,
                         splitCopyListingIntoChunksWithShuffle(jobContext));
   }

   /**
    * Assigns the first min(numMaps, chunks.size()) chunks to map tasks and
    * wraps each one in a FileSplit. The number of splits is published under
    * CONF_LABEL_NUM_SPLITS.
    * @param jobContext JobContext for the map job.
    * @param chunks The chunks written by the chunking step.
    * @return One split per assigned chunk.
    * @throws IOException if a chunk cannot be assigned to its task.
    */
   private List<InputSplit> createSplits(JobContext jobContext,
                                         List<DynamicInputChunk> chunks)
           throws IOException {
     int numMaps = getNumMapTasks(jobContext.getConfiguration());

     final int nSplits = Math.min(numMaps, chunks.size());
     List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);

     for (int i = 0; i < nSplits; ++i) {
       DynamicInputChunk chunk = chunks.get(i);
       TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
       chunk.assignTo(taskId);
       splits.add(new FileSplit(chunk.getPath(), 0,
           // Setting non-zero length for FileSplit size, to avoid a possible
           // future when 0-sized file-splits are considered "empty" and skipped
           // over.
           MIN_RECORDS_PER_CHUNK,
           null));
     }
     DistCpUtils.publish(jobContext.getConfiguration(),
                         CONF_LABEL_NUM_SPLITS, splits.size());
     return splits;
   }

   /**
    * Reads the copy-listing and distributes its records round-robin over
    * batches of chunk files. The number of entries per chunk is published
    * under CONF_LABEL_NUM_ENTRIES_PER_CHUNK.
    * @param context JobContext for the map job.
    * @return Every chunk that was created, already closed.
    * @throws IOException if the configuration asks for too many chunks, the
    * listing cannot be read, a chunk cannot be written, or the listing holds
    * more records than the configured record count allows for.
    */
   private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle
                                     (JobContext context) throws IOException {

     final Configuration configuration = context.getConfiguration();
     int numRecords = getNumberOfRecords(configuration);
     int numMaps = getNumMapTasks(configuration);
     // Number of chunks each map will process, on average.
     int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
     validateNumChunksUsing(splitRatio, numMaps);

     // Exact integer arithmetic: a float cannot represent every record count
     // above 2^24, which used to under-size the chunks and leave the last
     // records without a chunk to go to. A non-positive product is treated as
     // a single chunk slot so the division below is always defined.
     final long chunkSlots = Math.max(1L, (long) splitRatio * numMaps);
     final int numEntriesPerChunk = ceilDiv(numRecords, chunkSlots);
     DistCpUtils.publish(configuration,
                         CONF_LABEL_NUM_ENTRIES_PER_CHUNK,
                         numEntriesPerChunk);

     // No records means no chunks at all.
     final int nChunksTotal = numEntriesPerChunk > 0
             ? ceilDiv(numRecords, numEntriesPerChunk) : 0;
     int nChunksOpenAtOnce
             = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);

     Path listingPath = getListingFilePath(configuration);
     SequenceFile.Reader reader
             = new SequenceFile.Reader(configuration,
                                       SequenceFile.Reader.file(listingPath));

     List<DynamicInputChunk> openChunks
                   = new ArrayList<DynamicInputChunk>();

     List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();

     FileStatus fileStatus = new FileStatus();
     Text relPath = new Text();
     int recordCounter = 0;
     int chunkCount = 0;
     // Records the currently open batch can hold; 0 forces the first batch
     // to be created when the first record arrives.
     long batchCapacity = 0;

     try {

       while (reader.next(relPath, fileStatus)) {
         if (recordCounter == batchCapacity) {
           // All chunks full. Create new chunk-set.
           closeAll(openChunks);
           chunksFinal.addAll(openChunks);

           openChunks = createChunks(
                   configuration, chunkCount, nChunksTotal, nChunksOpenAtOnce);
           if (openChunks.isEmpty()) {
             // Every planned chunk is already full, yet records keep coming.
             throw new IOException("Copy-listing " + listingPath
                     + " holds more records than the " + numRecords
                     + " expected; no chunk is left to store them.");
           }

           chunkCount += openChunks.size();

           // The batch may be larger than requested when the tail was folded
           // into it; later batches are sized after this one.
           nChunksOpenAtOnce = openChunks.size();
           batchCapacity = (long) nChunksOpenAtOnce * numEntriesPerChunk;
           recordCounter = 0;
         }

         // Shuffle into open chunks.
         openChunks.get(recordCounter % nChunksOpenAtOnce)
                   .write(relPath, fileStatus);
         ++recordCounter;
       }

     } finally {
       closeAll(openChunks);
       chunksFinal.addAll(openChunks);
       IOUtils.closeStream(reader);
     }

     LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
     return chunksFinal;
   }

   /**
    * Ceiling of dividend / divisor, computed without floating point.
    * @param dividend Value to divide; expected to be non-negative.
    * @param divisor Value to divide by; must be positive.
    * @return The quotient rounded up, narrowed to int.
    */
   private static int ceilDiv(long dividend, long divisor) {
     return (int) ((dividend + divisor - 1) / divisor);
   }

   /**
    * Rejects configurations that would produce more than
    * MAX_CHUNKS_TOLERABLE chunks.
    * @param splitRatio Number of chunks per map.
    * @param numMaps Number of maps.
    * @throws IOException if splitRatio * numMaps exceeds the limit.
    */
   private static void validateNumChunksUsing(int splitRatio, int numMaps)
                                               throws IOException {
     // Multiply as long: an int product could overflow and wrap around to a
     // value below the limit.
     if ((long) splitRatio * numMaps > MAX_CHUNKS_TOLERABLE) {
       throw new IOException("Too many chunks created with splitRatio:"
                  + splitRatio + ", numMaps:" + numMaps
                  + ". Reduce numMaps or decrease split-ratio to proceed.");
     }
   }

   /**
    * Closes every chunk in the list.
    * @param chunks The chunks to close.
    */
   private static void closeAll(List<DynamicInputChunk> chunks) {
     for (DynamicInputChunk chunk : chunks) {
       chunk.close();
     }
   }

   /**
    * Creates the next batch of chunks, with ids starting at chunkCount.
    * @param config The job configuration.
    * @param chunkCount Number of chunks created so far.
    * @param nChunksTotal Total number of chunks planned.
    * @param nChunksOpenAtOnce Requested batch size.
    * @return The newly created chunks; empty when none are left to create.
    * @throws IOException if a chunk cannot be created.
    */
   private static List<DynamicInputChunk> createChunks(Configuration config,
                       int chunkCount, int nChunksTotal, int nChunksOpenAtOnce)
                                           throws IOException {
     List<DynamicInputChunk> chunks = new ArrayList<DynamicInputChunk>();
     int chunkIdUpperBound
             = Math.min(nChunksTotal, chunkCount + nChunksOpenAtOnce);

     // If there will be fewer than nChunksOpenAtOnce chunks left after
     // the current batch of chunks, fold the remaining chunks into
     // the current batch.
     if (nChunksTotal - chunkIdUpperBound < nChunksOpenAtOnce) {
       chunkIdUpperBound = nChunksTotal;
     }

     boolean allCreated = false;
     try {
       for (int i = chunkCount; i < chunkIdUpperBound; ++i) {
         chunks.add(createChunk(i, config));
       }
       allCreated = true;
     } finally {
       // The caller never sees a partially built batch, so release it here.
       if (!allCreated) {
         closeAll(chunks);
       }
     }
     return chunks;
   }

   /**
    * Creates a single chunk for writing, named by its zero-padded id.
    * @param chunkId Id of the chunk.
    * @param config The job configuration.
    * @return The new chunk.
    * @throws IOException if the chunk cannot be created.
    */
   private static DynamicInputChunk createChunk(int chunkId, Configuration config)
                                               throws IOException {
     return DynamicInputChunk.createChunkForWrite(String.format("%05d", chunkId),
                                               config);
   }

   /**
    * Looks up the copy-listing path in the configuration and verifies that
    * the file exists. The checks are explicit rather than assertions, so
    * they also run when assertions are disabled.
    * @param configuration The job configuration.
    * @return Path of the copy-listing file.
    * @throws IOException if the path is not configured, or the file is
    * missing or cannot be accessed.
    */
   private static Path getListingFilePath(Configuration configuration)
                                               throws IOException {
     String listingFilePathString = configuration.get(
             DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");

     if (listingFilePathString.equals("")) {
       throw new IOException("Listing file not found.");
     }

     Path listingFilePath = new Path(listingFilePathString);
     boolean listingExists;
     try {
       listingExists = listingFilePath.getFileSystem(configuration)
                                      .exists(listingFilePath);
     } catch (IOException e) {
       throw new IOException("Listing file: " + listingFilePath
                     + " couldn't be accessed. " + e.getMessage(), e);
     }
     if (!listingExists) {
       throw new IOException("Listing file: " + listingFilePath
                     + " not found.");
     }
     return listingFilePath;
   }

   /**
    * @param configuration The job configuration.
    * @return The value stored under CONF_LABEL_TOTAL_NUMBER_OF_RECORDS.
    */
   private static int getNumberOfRecords(Configuration configuration) {
     return DistCpUtils.getInt(configuration,
                               DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS);
   }

   /**
    * @param configuration The job configuration.
    * @return The value stored under JobContext.NUM_MAPS.
    */
   private static int getNumMapTasks(Configuration configuration) {
     return DistCpUtils.getInt(configuration,
                               JobContext.NUM_MAPS);
   }

   /**
    * @param configuration The job configuration.
    * @param numMaps Number of maps.
    * @param numPaths Number of records in the copy-listing.
    * @return The configured split-ratio, or the one computed by
    * getSplitRatio() when none is configured.
    */
   private static int getListingSplitRatio(Configuration configuration,
                                             int numMaps, int numPaths) {
     return configuration.getInt(
             CONF_LABEL_LISTING_SPLIT_RATIO,
             getSplitRatio(numMaps, numPaths));
   }

   /**
    * Package private, for testability.
    * @param nMaps The number of maps requested for.
    * @param nRecords The number of records to be copied.
    * @return The number of splits each map should handle, ideally.
    */
   static int getSplitRatio(int nMaps, int nRecords) {
     if (nMaps == 1) {
       LOG.warn("nMaps == 1. Why use DynamicInputFormat?");
       return 1;
     }

     if (nMaps > MAX_CHUNKS_IDEAL) {
       return SPLIT_RATIO_DEFAULT;
     }

     int nPickups = (int)Math.ceil((float)MAX_CHUNKS_IDEAL/nMaps);
     int nRecordsPerChunk = (int)Math.ceil((float)nRecords/(nMaps*nPickups));

     // Chunks that would end up too small are not worth the extra pickups.
     return nRecordsPerChunk < MIN_RECORDS_PER_CHUNK ?
               SPLIT_RATIO_DEFAULT : nPickups;
   }

   /**
    * @param configuration The job configuration.
    * @return The value stored under CONF_LABEL_NUM_ENTRIES_PER_CHUNK.
    */
   static int getNumEntriesPerChunk(Configuration configuration) {
     return DistCpUtils.getInt(configuration,
                               CONF_LABEL_NUM_ENTRIES_PER_CHUNK);
   }

   /**
    * Implementation of Inputformat::createRecordReader().
    * @param inputSplit The split for which the RecordReader is required.
    * @param taskAttemptContext TaskAttemptContext for the current attempt.
    * @return DynamicRecordReader instance.
    * @throws IOException on failure.
    * @throws InterruptedException declared by the overridden method.
    */
   @Override
   public RecordReader<K, V> createRecordReader(
           InputSplit inputSplit,
           TaskAttemptContext taskAttemptContext)
           throws IOException, InterruptedException {
     return new DynamicRecordReader<K, V>();
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.hadoop.tools.mapred.lib;

	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;
	import org.apache.hadoop.mapreduce.*;
	import org.apache.hadoop.mapreduce.lib.input.FileSplit;
	import org.apache.hadoop.tools.DistCpConstants;
	import org.apache.hadoop.tools.util.DistCpUtils;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.io.SequenceFile;
	import org.apache.hadoop.io.IOUtils;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.fs.FileStatus;

	import java.util.List;
	import java.util.ArrayList;
	import java.io.IOException;

	/**
	 * DynamicInputFormat implements the "Worker pattern" for DistCp.
	 * Rather than splitting the copy-list into a set of static splits,
	 * the DynamicInputFormat does the following:
	 * 1. Splits the copy-list into small chunks on the DFS.
	 * 2. Creates a set of empty "dynamic" splits, each of which consumes as many
	 *    chunks as it can.
	 * This arrangement ensures that a single slow mapper won't slow down the entire
	 * job (since the slack will be picked up by other mappers, who consume more
	 * chunks.)
	 * By varying the split-ratio, one can vary chunk sizes to achieve different
	 * performance characteristics.
	 */
	public class DynamicInputFormat<K, V> extends InputFormat<K, V> {
	  private static final Log LOG = LogFactory.getLog(DynamicInputFormat.class);

	  private static final String CONF_LABEL_LISTING_SPLIT_RATIO
	          = "mapred.listing.split.ratio";
	  private static final String CONF_LABEL_NUM_SPLITS
	          = "mapred.num.splits";
	  private static final String CONF_LABEL_NUM_ENTRIES_PER_CHUNK
	          = "mapred.num.entries.per.chunk";

	  /** Largest (splitRatio * numMaps) product accepted by validation. */
	  private static final int MAX_CHUNKS_TOLERABLE = 400;
	  /** Chunk count that getSplitRatio() aims for when nMaps is small. */
	  private static final int MAX_CHUNKS_IDEAL = 100;
	  /**
	   * Below this many records per chunk, getSplitRatio() falls back to the
	   * default ratio. Also used as the nominal (non-zero) FileSplit length.
	   */
	  private static final int MIN_RECORDS_PER_CHUNK = 5;
	  /** Split-ratio returned by getSplitRatio() when no better one applies. */
	  private static final int SPLIT_RATIO_DEFAULT = 2;
	  /** Size of the first batch of chunk files that is written concurrently. */
	  private static final int N_CHUNKS_OPEN_AT_ONCE_DEFAULT = 16;

	  /**
	   * Implementation of InputFormat::getSplits(). This method splits up the
	   * copy-listing file into chunks, and assigns the first batch to different
	   * tasks.
	   * @param jobContext JobContext for the map job.
	   * @return The list of (empty) dynamic input-splits.
	   * @throws IOException on failure.
	   * @throws InterruptedException declared by the overridden method.
	   */
	  @Override
	  public List<InputSplit> getSplits(JobContext jobContext)
	      throws IOException, InterruptedException {
	    LOG.info("DynamicInputFormat: Getting splits for job:"
	             + jobContext.getJobID());
	    return createSplits(jobContext,
	                        splitCopyListingIntoChunksWithShuffle(jobContext));
	  }

	  /**
	   * Assigns the first min(numMaps, chunks.size()) chunks to map tasks and
	   * wraps each one in a FileSplit. The number of splits is published under
	   * CONF_LABEL_NUM_SPLITS.
	   * @param jobContext JobContext for the map job.
	   * @param chunks The chunks written by the chunking step.
	   * @return One split per assigned chunk.
	   * @throws IOException if a chunk cannot be assigned to its task.
	   */
	  private List<InputSplit> createSplits(JobContext jobContext,
	                                        List<DynamicInputChunk> chunks)
	          throws IOException {
	    int numMaps = getNumMapTasks(jobContext.getConfiguration());

	    final int nSplits = Math.min(numMaps, chunks.size());
	    List<InputSplit> splits = new ArrayList<InputSplit>(nSplits);

	    for (int i = 0; i < nSplits; ++i) {
	      DynamicInputChunk chunk = chunks.get(i);
	      TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i);
	      chunk.assignTo(taskId);
	      splits.add(new FileSplit(chunk.getPath(), 0,
	          // Setting non-zero length for FileSplit size, to avoid a possible
	          // future when 0-sized file-splits are considered "empty" and skipped
	          // over.
	          MIN_RECORDS_PER_CHUNK,
	          null));
	    }
	    DistCpUtils.publish(jobContext.getConfiguration(),
	                        CONF_LABEL_NUM_SPLITS, splits.size());
	    return splits;
	  }

	  /**
	   * Reads the copy-listing and distributes its records round-robin over
	   * batches of chunk files. The number of entries per chunk is published
	   * under CONF_LABEL_NUM_ENTRIES_PER_CHUNK.
	   * @param context JobContext for the map job.
	   * @return Every chunk that was created, already closed.
	   * @throws IOException if the configuration asks for too many chunks, the
	   * listing cannot be read, a chunk cannot be written, or the listing holds
	   * more records than the configured record count allows for.
	   */
	  private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle
	                                    (JobContext context) throws IOException {

	    final Configuration configuration = context.getConfiguration();
	    int numRecords = getNumberOfRecords(configuration);
	    int numMaps = getNumMapTasks(configuration);
	    // Number of chunks each map will process, on average.
	    int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
	    validateNumChunksUsing(splitRatio, numMaps);

	    // Exact integer arithmetic: a float cannot represent every record count
	    // above 2^24, which used to under-size the chunks and leave the last
	    // records without a chunk to go to. A non-positive product is treated as
	    // a single chunk slot so the division below is always defined.
	    final long chunkSlots = Math.max(1L, (long) splitRatio * numMaps);
	    final int numEntriesPerChunk = ceilDiv(numRecords, chunkSlots);
	    DistCpUtils.publish(configuration,
	                        CONF_LABEL_NUM_ENTRIES_PER_CHUNK,
	                        numEntriesPerChunk);

	    // No records means no chunks at all.
	    final int nChunksTotal = numEntriesPerChunk > 0
	            ? ceilDiv(numRecords, numEntriesPerChunk) : 0;
	    int nChunksOpenAtOnce
	            = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);

	    Path listingPath = getListingFilePath(configuration);
	    SequenceFile.Reader reader
	            = new SequenceFile.Reader(configuration,
	                                      SequenceFile.Reader.file(listingPath));

	    List<DynamicInputChunk> openChunks
	                  = new ArrayList<DynamicInputChunk>();

	    List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();

	    FileStatus fileStatus = new FileStatus();
	    Text relPath = new Text();
	    int recordCounter = 0;
	    int chunkCount = 0;
	    // Records the currently open batch can hold; 0 forces the first batch
	    // to be created when the first record arrives.
	    long batchCapacity = 0;

	    try {

	      while (reader.next(relPath, fileStatus)) {
	        if (recordCounter == batchCapacity) {
	          // All chunks full. Create new chunk-set.
	          closeAll(openChunks);
	          chunksFinal.addAll(openChunks);

	          openChunks = createChunks(
	                  configuration, chunkCount, nChunksTotal, nChunksOpenAtOnce);
	          if (openChunks.isEmpty()) {
	            // Every planned chunk is already full, yet records keep coming.
	            throw new IOException("Copy-listing " + listingPath
	                    + " holds more records than the " + numRecords
	                    + " expected; no chunk is left to store them.");
	          }

	          chunkCount += openChunks.size();

	          // The batch may be larger than requested when the tail was folded
	          // into it; later batches are sized after this one.
	          nChunksOpenAtOnce = openChunks.size();
	          batchCapacity = (long) nChunksOpenAtOnce * numEntriesPerChunk;
	          recordCounter = 0;
	        }

	        // Shuffle into open chunks.
	        openChunks.get(recordCounter % nChunksOpenAtOnce)
	                  .write(relPath, fileStatus);
	        ++recordCounter;
	      }

	    } finally {
	      closeAll(openChunks);
	      chunksFinal.addAll(openChunks);
	      IOUtils.closeStream(reader);
	    }

	    LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
	    return chunksFinal;
	  }

	  /**
	   * Ceiling of dividend / divisor, computed without floating point.
	   * @param dividend Value to divide; expected to be non-negative.
	   * @param divisor Value to divide by; must be positive.
	   * @return The quotient rounded up, narrowed to int.
	   */
	  private static int ceilDiv(long dividend, long divisor) {
	    return (int) ((dividend + divisor - 1) / divisor);
	  }

	  /**
	   * Rejects configurations that would produce more than
	   * MAX_CHUNKS_TOLERABLE chunks.
	   * @param splitRatio Number of chunks per map.
	   * @param numMaps Number of maps.
	   * @throws IOException if splitRatio * numMaps exceeds the limit.
	   */
	  private static void validateNumChunksUsing(int splitRatio, int numMaps)
	                                              throws IOException {
	    // Multiply as long: an int product could overflow and wrap around to a
	    // value below the limit.
	    if ((long) splitRatio * numMaps > MAX_CHUNKS_TOLERABLE) {
	      throw new IOException("Too many chunks created with splitRatio:"
	                 + splitRatio + ", numMaps:" + numMaps
	                 + ". Reduce numMaps or decrease split-ratio to proceed.");
	    }
	  }

	  /**
	   * Closes every chunk in the list.
	   * @param chunks The chunks to close.
	   */
	  private static void closeAll(List<DynamicInputChunk> chunks) {
	    for (DynamicInputChunk chunk : chunks) {
	      chunk.close();
	    }
	  }

	  /**
	   * Creates the next batch of chunks, with ids starting at chunkCount.
	   * @param config The job configuration.
	   * @param chunkCount Number of chunks created so far.
	   * @param nChunksTotal Total number of chunks planned.
	   * @param nChunksOpenAtOnce Requested batch size.
	   * @return The newly created chunks; empty when none are left to create.
	   * @throws IOException if a chunk cannot be created.
	   */
	  private static List<DynamicInputChunk> createChunks(Configuration config,
	                      int chunkCount, int nChunksTotal, int nChunksOpenAtOnce)
	                                          throws IOException {
	    List<DynamicInputChunk> chunks = new ArrayList<DynamicInputChunk>();
	    int chunkIdUpperBound
	            = Math.min(nChunksTotal, chunkCount + nChunksOpenAtOnce);

	    // If there will be fewer than nChunksOpenAtOnce chunks left after
	    // the current batch of chunks, fold the remaining chunks into
	    // the current batch.
	    if (nChunksTotal - chunkIdUpperBound < nChunksOpenAtOnce) {
	      chunkIdUpperBound = nChunksTotal;
	    }

	    boolean allCreated = false;
	    try {
	      for (int i = chunkCount; i < chunkIdUpperBound; ++i) {
	        chunks.add(createChunk(i, config));
	      }
	      allCreated = true;
	    } finally {
	      // The caller never sees a partially built batch, so release it here.
	      if (!allCreated) {
	        closeAll(chunks);
	      }
	    }
	    return chunks;
	  }

	  /**
	   * Creates a single chunk for writing, named by its zero-padded id.
	   * @param chunkId Id of the chunk.
	   * @param config The job configuration.
	   * @return The new chunk.
	   * @throws IOException if the chunk cannot be created.
	   */
	  private static DynamicInputChunk createChunk(int chunkId, Configuration config)
	                                              throws IOException {
	    return DynamicInputChunk.createChunkForWrite(String.format("%05d", chunkId),
	                                              config);
	  }

	  /**
	   * Looks up the copy-listing path in the configuration and verifies that
	   * the file exists. The checks are explicit rather than assertions, so
	   * they also run when assertions are disabled.
	   * @param configuration The job configuration.
	   * @return Path of the copy-listing file.
	   * @throws IOException if the path is not configured, or the file is
	   * missing or cannot be accessed.
	   */
	  private static Path getListingFilePath(Configuration configuration)
	                                              throws IOException {
	    String listingFilePathString = configuration.get(
	            DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");

	    if (listingFilePathString.equals("")) {
	      throw new IOException("Listing file not found.");
	    }

	    Path listingFilePath = new Path(listingFilePathString);
	    boolean listingExists;
	    try {
	      listingExists = listingFilePath.getFileSystem(configuration)
	                                     .exists(listingFilePath);
	    } catch (IOException e) {
	      throw new IOException("Listing file: " + listingFilePath
	                    + " couldn't be accessed. " + e.getMessage(), e);
	    }
	    if (!listingExists) {
	      throw new IOException("Listing file: " + listingFilePath
	                    + " not found.");
	    }
	    return listingFilePath;
	  }

	  /**
	   * @param configuration The job configuration.
	   * @return The value stored under CONF_LABEL_TOTAL_NUMBER_OF_RECORDS.
	   */
	  private static int getNumberOfRecords(Configuration configuration) {
	    return DistCpUtils.getInt(configuration,
	                              DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS);
	  }

	  /**
	   * @param configuration The job configuration.
	   * @return The value stored under JobContext.NUM_MAPS.
	   */
	  private static int getNumMapTasks(Configuration configuration) {
	    return DistCpUtils.getInt(configuration,
	                              JobContext.NUM_MAPS);
	  }

	  /**
	   * @param configuration The job configuration.
	   * @param numMaps Number of maps.
	   * @param numPaths Number of records in the copy-listing.
	   * @return The configured split-ratio, or the one computed by
	   * getSplitRatio() when none is configured.
	   */
	  private static int getListingSplitRatio(Configuration configuration,
	                                            int numMaps, int numPaths) {
	    return configuration.getInt(
	            CONF_LABEL_LISTING_SPLIT_RATIO,
	            getSplitRatio(numMaps, numPaths));
	  }

	  /**
	   * Package private, for testability.
	   * @param nMaps The number of maps requested for.
	   * @param nRecords The number of records to be copied.
	   * @return The number of splits each map should handle, ideally.
	   */
	  static int getSplitRatio(int nMaps, int nRecords) {
	    if (nMaps == 1) {
	      LOG.warn("nMaps == 1. Why use DynamicInputFormat?");
	      return 1;
	    }

	    if (nMaps > MAX_CHUNKS_IDEAL) {
	      return SPLIT_RATIO_DEFAULT;
	    }

	    int nPickups = (int)Math.ceil((float)MAX_CHUNKS_IDEAL/nMaps);
	    int nRecordsPerChunk = (int)Math.ceil((float)nRecords/(nMaps*nPickups));

	    // Chunks that would end up too small are not worth the extra pickups.
	    return nRecordsPerChunk < MIN_RECORDS_PER_CHUNK ?
	              SPLIT_RATIO_DEFAULT : nPickups;
	  }

	  /**
	   * @param configuration The job configuration.
	   * @return The value stored under CONF_LABEL_NUM_ENTRIES_PER_CHUNK.
	   */
	  static int getNumEntriesPerChunk(Configuration configuration) {
	    return DistCpUtils.getInt(configuration,
	                              CONF_LABEL_NUM_ENTRIES_PER_CHUNK);
	  }

	  /**
	   * Implementation of Inputformat::createRecordReader().
	   * @param inputSplit The split for which the RecordReader is required.
	   * @param taskAttemptContext TaskAttemptContext for the current attempt.
	   * @return DynamicRecordReader instance.
	   * @throws IOException on failure.
	   * @throws InterruptedException declared by the overridden method.
	   */
	  @Override
	  public RecordReader<K, V> createRecordReader(
	          InputSplit inputSplit,
	          TaskAttemptContext taskAttemptContext)
	          throws IOException, InterruptedException {
	    return new DynamicRecordReader<K, V>();
	  }
	}