| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hadoop.mapreduce; |
| |
| import java.io.IOException; |
| |
| import org.apache.hadoop.classification.InterfaceAudience; |
| import org.apache.hadoop.classification.InterfaceStability; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.RawComparator; |
| import org.apache.hadoop.io.compress.CompressionCodec; |
| import org.apache.hadoop.mapreduce.task.MapContextImpl; |
| |
| /** |
| * Maps input key/value pairs to a set of intermediate key/value pairs. |
| * |
| * <p>Maps are the individual tasks which transform input records into a |
| * intermediate records. The transformed intermediate records need not be of |
| * the same type as the input records. A given input pair may map to zero or |
| * many output pairs.</p> |
| * |
| * <p>The Hadoop Map-Reduce framework spawns one map task for each |
| * {@link InputSplit} generated by the {@link InputFormat} for the job. |
| * <code>Mapper</code> implementations can access the {@link Configuration} for |
| * the job via the {@link JobContext#getConfiguration()}. |
| * |
| * <p>The framework first calls |
| * {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)}, followed by |
| * {@link #map(Object, Object, Context)} |
| * for each key/value pair in the <code>InputSplit</code>. Finally |
| * {@link #cleanup(Context)} is called.</p> |
| * |
| * <p>All intermediate values associated with a given output key are |
| * subsequently grouped by the framework, and passed to a {@link Reducer} to |
| * determine the final output. Users can control the sorting and grouping by |
| * specifying two key {@link RawComparator} classes.</p> |
| * |
| * <p>The <code>Mapper</code> outputs are partitioned per |
| * <code>Reducer</code>. Users can control which keys (and hence records) go to |
| * which <code>Reducer</code> by implementing a custom {@link Partitioner}. |
| * |
| * <p>Users can optionally specify a <code>combiner</code>, via |
| * {@link Job#setCombinerClass(Class)}, to perform local aggregation of the |
| * intermediate outputs, which helps to cut down the amount of data transferred |
| * from the <code>Mapper</code> to the <code>Reducer</code>. |
| * |
| * <p>Applications can specify if and how the intermediate |
| * outputs are to be compressed and which {@link CompressionCodec}s are to be |
| * used via the <code>Configuration</code>.</p> |
| * |
| * <p>If the job has zero |
| * reduces then the output of the <code>Mapper</code> is directly written |
| * to the {@link OutputFormat} without sorting by keys.</p> |
| * |
| * <p>Example:</p> |
| * <p><blockquote><pre> |
| * public class TokenCounterMapper |
| * extends Mapper<Object, Text, Text, IntWritable>{ |
| * |
| * private final static IntWritable one = new IntWritable(1); |
| * private Text word = new Text(); |
| * |
| * public void map(Object key, Text value, Context context) throws IOException, InterruptedException { |
| * StringTokenizer itr = new StringTokenizer(value.toString()); |
| * while (itr.hasMoreTokens()) { |
| * word.set(itr.nextToken()); |
| * context.write(word, one); |
| * } |
| * } |
| * } |
| * </pre></blockquote></p> |
| * |
| * <p>Applications may override the {@link #run(Context)} method to exert |
| * greater control on map processing e.g. multi-threaded <code>Mapper</code>s |
| * etc.</p> |
| * |
| * @see InputFormat |
| * @see JobContext |
| * @see Partitioner |
| * @see Reducer |
| */ |
| @InterfaceAudience.Public |
| @InterfaceStability.Stable |
| public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> { |
| |
| /** |
| * The <code>Context</code> passed on to the {@link Mapper} implementations. |
| */ |
| public abstract class Context |
| implements MapContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> { |
| } |
| |
| /** |
| * Called once at the beginning of the task. |
| */ |
| protected void setup(Context context |
| ) throws IOException, InterruptedException { |
| // NOTHING |
| } |
| |
| /** |
| * Called once for each key/value pair in the input split. Most applications |
| * should override this, but the default is the identity function. |
| */ |
| @SuppressWarnings("unchecked") |
| protected void map(KEYIN key, VALUEIN value, |
| Context context) throws IOException, InterruptedException { |
| context.write((KEYOUT) key, (VALUEOUT) value); |
| } |
| |
| /** |
| * Called once at the end of the task. |
| */ |
| protected void cleanup(Context context |
| ) throws IOException, InterruptedException { |
| // NOTHING |
| } |
| |
| /** |
| * Expert users can override this method for more complete control over the |
| * execution of the Mapper. |
| * @param context |
| * @throws IOException |
| */ |
| public void run(Context context) throws IOException, InterruptedException { |
| setup(context); |
| while (context.nextKeyValue()) { |
| map(context.getCurrentKey(), context.getCurrentValue(), context); |
| } |
| cleanup(context); |
| } |
| } |