src/java/org/apache/hadoop/mapreduce/Mapper.java - hadoop-mapreduce - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.hadoop.mapreduce;

 import java.io.IOException;

 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.RawComparator;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.mapreduce.task.MapContextImpl;

 /**
  * Maps input key/value pairs to a set of intermediate key/value pairs.
  *
  * <p>Maps are the individual tasks which transform input records into
  * intermediate records. The transformed intermediate records need not be of
  * the same type as the input records. A given input pair may map to zero or
  * many output pairs.</p>
  *
  * <p>The Hadoop Map-Reduce framework spawns one map task for each
  * {@link InputSplit} generated by the {@link InputFormat} for the job.
  * <code>Mapper</code> implementations can access the {@link Configuration} for
  * the job via the {@link JobContext#getConfiguration()}.
  *
  * <p>The framework first calls
  * {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)}, followed by
  * {@link #map(Object, Object, Context)}
  * for each key/value pair in the <code>InputSplit</code>. Finally
  * {@link #cleanup(Context)} is called.</p>
  *
  * <p>All intermediate values associated with a given output key are
  * subsequently grouped by the framework, and passed to a {@link Reducer} to
  * determine the final output. Users can control the sorting and grouping by
  * specifying two key {@link RawComparator} classes.</p>
  *
  * <p>The <code>Mapper</code> outputs are partitioned per
  * <code>Reducer</code>. Users can control which keys (and hence records) go to
  * which <code>Reducer</code> by implementing a custom {@link Partitioner}.
  *
  * <p>Users can optionally specify a <code>combiner</code>, via
  * {@link Job#setCombinerClass(Class)}, to perform local aggregation of the
  * intermediate outputs, which helps to cut down the amount of data transferred
  * from the <code>Mapper</code> to the <code>Reducer</code>.
  *
  * <p>Applications can specify if and how the intermediate
  * outputs are to be compressed and which {@link CompressionCodec}s are to be
  * used via the <code>Configuration</code>.</p>
  *
  * <p>If the job has zero
  * reduces then the output of the <code>Mapper</code> is directly written
  * to the {@link OutputFormat} without sorting by keys.</p>
  *
  * <p>Example:</p>
  * <p><blockquote><pre>
  * public class TokenCounterMapper
  *     extends Mapper&lt;Object, Text, Text, IntWritable&gt;{
  *
  *   private final static IntWritable one = new IntWritable(1);
  *   private Text word = new Text();
  *
  *   public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
  *     StringTokenizer itr = new StringTokenizer(value.toString());
  *     while (itr.hasMoreTokens()) {
  *       word.set(itr.nextToken());
  *       context.write(word, one);
  *     }
  *   }
  * }
  * </pre></blockquote></p>
  *
  * <p>Applications may override the {@link #run(Context)} method to exert
  * greater control on map processing e.g. multi-threaded <code>Mapper</code>s
  * etc.</p>
  *
  * @see InputFormat
  * @see JobContext
  * @see Partitioner
  * @see Reducer
  */
 @InterfaceAudience.Public
 @InterfaceStability.Stable
 public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

   /**
    * The <code>Context</code> passed on to the {@link Mapper} implementations.
    */
   public abstract class Context
     implements MapContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
   }

   /**
    * Called once at the beginning of the task, before any record is passed to
    * {@link #map(Object, Object, Context)}. The default implementation does
    * nothing.
    *
    * @param context the context the task is running with
    * @throws IOException not thrown by the default implementation; declared
    *         so that overriding implementations may throw it
    * @throws InterruptedException not thrown by the default implementation;
    *         declared so that overriding implementations may throw it
    */
   protected void setup(Context context
                        ) throws IOException, InterruptedException {
     // NOTHING
   }

   /**
    * Called once for each key/value pair in the input split. Most applications
    * should override this, but the default is the identity function: the
    * input pair is written to the context unchanged.
    *
    * @param key the current input key
    * @param value the current input value
    * @param context the context the output pair is written to
    * @throws IOException if writing to the context fails with an I/O error
    * @throws InterruptedException if writing to the context is interrupted
    */
   @SuppressWarnings("unchecked")
   protected void map(KEYIN key, VALUEIN value,
                      Context context) throws IOException, InterruptedException {
     // The casts are unchecked: generic types are erased, so the identity
     // mapping only makes sense when the input and output types agree.
     context.write((KEYOUT) key, (VALUEOUT) value);
   }

   /**
    * Called once at the end of the task, after the last record has been
    * processed. The default implementation does nothing.
    *
    * @param context the context the task is running with
    * @throws IOException not thrown by the default implementation; declared
    *         so that overriding implementations may throw it
    * @throws InterruptedException not thrown by the default implementation;
    *         declared so that overriding implementations may throw it
    */
   protected void cleanup(Context context
                          ) throws IOException, InterruptedException {
     // NOTHING
   }

   /**
    * Expert users can override this method for more complete control over the
    * execution of the Mapper.
    *
    * <p>The default implementation calls {@link #setup(Context)}, then
    * {@link #map(Object, Object, Context)} for every pair for which
    * <code>context.nextKeyValue()</code> returns <code>true</code>, and finally
    * {@link #cleanup(Context)}. Once <code>setup</code> has completed,
    * <code>cleanup</code> is always invoked, even when reading or mapping a
    * record throws. If <code>setup</code> itself throws, <code>cleanup</code>
    * is not called.</p>
    *
    * @param context the context supplying the input pairs and receiving the
    *        output
    * @throws IOException if setup, reading, mapping or cleanup fails with an
    *         I/O error
    * @throws InterruptedException if setup, reading, mapping or cleanup is
    *         interrupted
    */
   public void run(Context context) throws IOException, InterruptedException {
     setup(context);
     try {
       while (context.nextKeyValue()) {
         map(context.getCurrentKey(), context.getCurrentValue(), context);
       }
     } finally {
       // Run cleanup on the failure path too, so that whatever setup()
       // acquired is released when a record cannot be read or mapped.
       cleanup(context);
     }
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.hadoop.mapreduce;

	import java.io.IOException;

	import org.apache.hadoop.classification.InterfaceAudience;
	import org.apache.hadoop.classification.InterfaceStability;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.io.RawComparator;
	import org.apache.hadoop.io.compress.CompressionCodec;
	import org.apache.hadoop.mapreduce.task.MapContextImpl;

	/**
	* Maps input key/value pairs to a set of intermediate key/value pairs.
	*
	* <p>Maps are the individual tasks which transform input records into
	* intermediate records. The transformed intermediate records need not be of
	* the same type as the input records. A given input pair may map to zero or
	* many output pairs.</p>
	*
	* <p>The Hadoop Map-Reduce framework spawns one map task for each
	* {@link InputSplit} generated by the {@link InputFormat} for the job.
	* <code>Mapper</code> implementations can access the {@link Configuration} for
	* the job via the {@link JobContext#getConfiguration()}.
	*
	* <p>The framework first calls
	* {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)}, followed by
	* {@link #map(Object, Object, Context)}
	* for each key/value pair in the <code>InputSplit</code>. Finally
	* {@link #cleanup(Context)} is called.</p>
	*
	* <p>All intermediate values associated with a given output key are
	* subsequently grouped by the framework, and passed to a {@link Reducer} to
	* determine the final output. Users can control the sorting and grouping by
	* specifying two key {@link RawComparator} classes.</p>
	*
	* <p>The <code>Mapper</code> outputs are partitioned per
	* <code>Reducer</code>. Users can control which keys (and hence records) go to
	* which <code>Reducer</code> by implementing a custom {@link Partitioner}.
	*
	* <p>Users can optionally specify a <code>combiner</code>, via
	* {@link Job#setCombinerClass(Class)}, to perform local aggregation of the
	* intermediate outputs, which helps to cut down the amount of data transferred
	* from the <code>Mapper</code> to the <code>Reducer</code>.
	*
	* <p>Applications can specify if and how the intermediate
	* outputs are to be compressed and which {@link CompressionCodec}s are to be
	* used via the <code>Configuration</code>.</p>
	*
	* <p>If the job has zero
	* reduces then the output of the <code>Mapper</code> is directly written
	* to the {@link OutputFormat} without sorting by keys.</p>
	*
	* <p>Example:</p>
	* <p><blockquote><pre>
	* public class TokenCounterMapper
	*     extends Mapper&lt;Object, Text, Text, IntWritable&gt;{
	*
	*   private final static IntWritable one = new IntWritable(1);
	*   private Text word = new Text();
	*
	*   public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
	*     StringTokenizer itr = new StringTokenizer(value.toString());
	*     while (itr.hasMoreTokens()) {
	*       word.set(itr.nextToken());
	*       context.write(word, one);
	*     }
	*   }
	* }
	* </pre></blockquote></p>
	*
	* <p>Applications may override the {@link #run(Context)} method to exert
	* greater control on map processing e.g. multi-threaded <code>Mapper</code>s
	* etc.</p>
	*
	* @see InputFormat
	* @see JobContext
	* @see Partitioner
	* @see Reducer
	*/
	@InterfaceAudience.Public
	@InterfaceStability.Stable
	public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

	  /**
	   * The <code>Context</code> passed on to the {@link Mapper} implementations.
	   */
	  public abstract class Context
	    implements MapContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
	  }

	  /**
	   * Called once at the beginning of the task, before any record is passed to
	   * {@link #map(Object, Object, Context)}. The default implementation does
	   * nothing.
	   *
	   * @param context the context the task is running with
	   * @throws IOException not thrown by the default implementation; declared
	   *         so that overriding implementations may throw it
	   * @throws InterruptedException not thrown by the default implementation;
	   *         declared so that overriding implementations may throw it
	   */
	  protected void setup(Context context
	                       ) throws IOException, InterruptedException {
	    // NOTHING
	  }

	  /**
	   * Called once for each key/value pair in the input split. Most applications
	   * should override this, but the default is the identity function: the
	   * input pair is written to the context unchanged.
	   *
	   * @param key the current input key
	   * @param value the current input value
	   * @param context the context the output pair is written to
	   * @throws IOException if writing to the context fails with an I/O error
	   * @throws InterruptedException if writing to the context is interrupted
	   */
	  @SuppressWarnings("unchecked")
	  protected void map(KEYIN key, VALUEIN value,
	                     Context context) throws IOException, InterruptedException {
	    // The casts are unchecked: generic types are erased, so the identity
	    // mapping only makes sense when the input and output types agree.
	    context.write((KEYOUT) key, (VALUEOUT) value);
	  }

	  /**
	   * Called once at the end of the task, after the last record has been
	   * processed. The default implementation does nothing.
	   *
	   * @param context the context the task is running with
	   * @throws IOException not thrown by the default implementation; declared
	   *         so that overriding implementations may throw it
	   * @throws InterruptedException not thrown by the default implementation;
	   *         declared so that overriding implementations may throw it
	   */
	  protected void cleanup(Context context
	                         ) throws IOException, InterruptedException {
	    // NOTHING
	  }

	  /**
	   * Expert users can override this method for more complete control over the
	   * execution of the Mapper.
	   *
	   * <p>The default implementation calls {@link #setup(Context)}, then
	   * {@link #map(Object, Object, Context)} for every pair for which
	   * <code>context.nextKeyValue()</code> returns <code>true</code>, and finally
	   * {@link #cleanup(Context)}. Once <code>setup</code> has completed,
	   * <code>cleanup</code> is always invoked, even when reading or mapping a
	   * record throws. If <code>setup</code> itself throws, <code>cleanup</code>
	   * is not called.</p>
	   *
	   * @param context the context supplying the input pairs and receiving the
	   *        output
	   * @throws IOException if setup, reading, mapping or cleanup fails with an
	   *         I/O error
	   * @throws InterruptedException if setup, reading, mapping or cleanup is
	   *         interrupted
	   */
	  public void run(Context context) throws IOException, InterruptedException {
	    setup(context);
	    try {
	      while (context.nextKeyValue()) {
	        map(context.getCurrentKey(), context.getCurrentValue(), context);
	      }
	    } finally {
	      // Run cleanup on the failure path too, so that whatever setup()
	      // acquired is released when a record cannot be read or mapped.
	      cleanup(context);
	    }
	  }
	}