| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hadoop.mapreduce; |
| |
| import java.io.IOException; |
| import java.util.List; |
| |
| import org.apache.hadoop.classification.InterfaceAudience; |
| import org.apache.hadoop.classification.InterfaceStability; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; |
| |
| /** |
| * <code>InputFormat</code> describes the input-specification for a |
| * Map-Reduce job. |
| * |
| * <p>The Map-Reduce framework relies on the <code>InputFormat</code> of the |
| * job to:</p> |
| * <ol> |
| * <li> |
| * Validate the input-specification of the job. |
| * </li> |
| * <li> |
| * Split-up the input file(s) into logical {@link InputSplit}s, each of |
| * which is then assigned to an individual {@link Mapper}. |
| * </li> |
| * <li> |
| * Provide the {@link RecordReader} implementation to be used to glean |
| * input records from the logical <code>InputSplit</code> for processing by |
| * the {@link Mapper}. |
| * </li> |
| * </ol> |
| * |
| * <p>The default behavior of file-based {@link InputFormat}s, typically |
| * sub-classes of {@link FileInputFormat}, is to split the |
| * input into <i>logical</i> {@link InputSplit}s based on the total size, in |
| * bytes, of the input files. However, the {@link FileSystem} blocksize of |
| * the input files is treated as an upper bound for input splits. A lower bound |
| * on the split size can be set via |
| * <a href="{@docRoot}/../hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml#mapreduce.input.fileinputformat.split.minsize"> |
| * mapreduce.input.fileinputformat.split.minsize</a>.</p> |
| * |
| * <p>Clearly, logical splits based on input-size alone are insufficient for |
| * many applications, since record boundaries must be respected. In such |
| * cases, the application also has to implement a {@link RecordReader}, on |
| * which lies the responsibility to respect record-boundaries and present a |
| * record-oriented view of the logical <code>InputSplit</code> to the |
| * individual task.</p> |
| * |
| * @param <K> first type argument of the {@link RecordReader} returned by |
| * {@link #createRecordReader(InputSplit, TaskAttemptContext)} |
| * (presumably the record key type) |
| * @param <V> second type argument of the {@link RecordReader} returned by |
| * {@link #createRecordReader(InputSplit, TaskAttemptContext)} |
| * (presumably the record value type) |
| * |
| * @see InputSplit |
| * @see RecordReader |
| * @see FileInputFormat |
| */ |
| @InterfaceAudience.Public |
| @InterfaceStability.Stable |
| public abstract class InputFormat<K, V> { |
| |
| /** |
| * Logically split the set of input files for the job. |
| * |
| * <p>Each {@link InputSplit} is then assigned to an individual {@link Mapper} |
| * for processing.</p> |
| * |
| * <p><i>Note</i>: The split is a <i>logical</i> split of the inputs and the |
| * input files are not physically split into chunks. For example, a split |
| * could be an <i>&lt;input-file-path, start, offset&gt;</i> tuple. The |
| * InputFormat also creates the {@link RecordReader} to read the |
| * {@link InputSplit}.</p> |
| * |
| * @param context job configuration. |
| * @return the list of {@link InputSplit}s for the job. |
| * @throws IOException if the implementation encounters an I/O error while |
| * computing the splits |
| * @throws InterruptedException if the implementation is interrupted while |
| * computing the splits |
| */ |
| public abstract |
| List<InputSplit> getSplits(JobContext context |
| ) throws IOException, InterruptedException; |
| |
| /** |
| * Create a record reader for a given split. The framework will call |
| * {@link RecordReader#initialize(InputSplit, TaskAttemptContext)} before |
| * the split is used. |
| * |
| * @param split the split to be read |
| * @param context the information about the task |
| * @return a new record reader |
| * @throws IOException if the implementation encounters an I/O error while |
| * creating the record reader |
| * @throws InterruptedException if the implementation is interrupted while |
| * creating the record reader |
| */ |
| public abstract |
| RecordReader<K,V> createRecordReader(InputSplit split, |
| TaskAttemptContext context |
| ) throws IOException, |
| InterruptedException; |
| |
| } |
| |