/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.streaming;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.SkipBadRecords;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.streaming.io.InputWriter;
import org.apache.hadoop.streaming.io.OutputReader;
import org.apache.hadoop.streaming.io.TextInputWriter;
/**
 * A generic Mapper bridge.
 * It delegates map operations to an external program via the program's
 * stdin and stdout.
 */
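// Illustrative usage (assumed, not part of this class): a streaming job
// supplies the external command through the hadoop-streaming jar, e.g.
//
//   hadoop jar hadoop-streaming.jar \
//     -input /data/in -output /data/out \
//     -mapper /bin/cat -reducer /usr/bin/wc
//
// StreamJob URL-encodes the -mapper command into the job configuration key
// "stream.map.streamprocessor", which getPipeCommand(JobConf) below decodes.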
public class PipeMapper extends PipeMapRed implements Mapper {

  private boolean ignoreKey = false;
  private boolean skipping = false;

  private byte[] mapOutputFieldSeparator;
  private byte[] mapInputFieldSeparator;
  private int numOfMapOutputKeyFields = 1;
  /** Returns the external command, URL-decoded from
   *  "stream.map.streamprocessor", or null if it is not set. */
  String getPipeCommand(JobConf job) {
    String str = job.get("stream.map.streamprocessor");
    if (str == null) {
      return str;
    }
    try {
      return URLDecoder.decode(str, "UTF-8");
    } catch (UnsupportedEncodingException e) {
      System.err.println("Could not URL-decode stream.map.streamprocessor: "
          + "UTF-8 encoding is not supported");
      return null;
    }
  }
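  // For example (illustrative values): if the driver encoded the command
  // "/bin/cat -n", the stored string "%2Fbin%2Fcat+-n" decodes back to
  // "/bin/cat -n" above.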
  boolean getDoPipe() {
    return true;
  }
  public void configure(JobConf job) {
    super.configure(job);
    // Disable auto-increment of the processed-record counter: for streaming,
    // the number of processed records may be equal to or less than the
    // number of input records.
    SkipBadRecords.setAutoIncrMapperProcCount(job, false);
    skipping = job.getBoolean(MRJobConfig.SKIP_RECORDS, false);
    if (mapInputWriterClass_.getCanonicalName()
        .equals(TextInputWriter.class.getCanonicalName())) {
      String inputFormatClassName = job
          .getClass("mapred.input.format.class", TextInputFormat.class)
          .getCanonicalName();
      ignoreKey = job.getBoolean("stream.map.input.ignoreKey",
          inputFormatClassName.equals(TextInputFormat.class.getCanonicalName()));
    }

    try {
      mapOutputFieldSeparator =
          job.get("stream.map.output.field.separator", "\t").getBytes("UTF-8");
      mapInputFieldSeparator =
          job.get("stream.map.input.field.separator", "\t").getBytes("UTF-8");
      numOfMapOutputKeyFields =
          job.getInt("stream.num.map.output.key.fields", 1);
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException(
          "The current system does not support UTF-8 encoding!", e);
    }
  }
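  // A minimal sketch (assumed driver-side code, not part of this class) of
  // overriding the separators read above:
  //
  //   JobConf conf = new JobConf();
  //   conf.set("stream.map.output.field.separator", ",");
  //   conf.setInt("stream.num.map.output.key.fields", 2);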
  // Do NOT declare an explicit constructor:
  // MapRed instantiates this class reflectively via the default constructor.
  public void map(Object key, Object value, OutputCollector output,
                  Reporter reporter) throws IOException {
    if (outerrThreadsThrowable != null) {
      mapRedFinished();
      throw new IOException("MROutput/MRErrThread failed:",
          outerrThreadsThrowable);
    }
    try {
      // 1/4 Hadoop in
      numRecRead_++;
      maybeLogRecord();

      // 2/4 Hadoop to Tool
      if (numExceptions_ == 0) {
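        // With TextInputWriter and the default separators (a sketch of the
        // common case; other InputWriter implementations encode differently),
        // each record reaches the external program's stdin as "key\tvalue\n",
        // or just "value\n" when ignoreKey is set.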
        if (!this.ignoreKey) {
          inWriter_.writeKey(key);
        }
        inWriter_.writeValue(value);
        if (skipping) {
          // Flush the streams on every record input when running in skip
          // mode, so that we don't buffer other records surrounding a bad
          // record.
          clientOut_.flush();
        }
      } else {
        numRecSkipped_++;
      }
    } catch (IOException io) {
      numExceptions_++;
      if (numExceptions_ > 1 || numRecWritten_ < minRecWrittenToEnableSkip_) {
        // Terminate with failure.
        LOG.info(getContext(), io);
        mapRedFinished();
        throw io;
      } else {
        // Terminate with success: swallow the remaining input records even
        // though the stream processor failed/closed.
      }
    }
  }
  public void close() {
    mapRedFinished();
  }
  @Override
  public byte[] getInputSeparator() {
    return mapInputFieldSeparator;
  }

  @Override
  public byte[] getFieldSeparator() {
    return mapOutputFieldSeparator;
  }

  @Override
  public int getNumOfKeyFields() {
    return numOfMapOutputKeyFields;
  }
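  // Illustrative example (assumed values): with the default "\t" separator
  // and getNumOfKeyFields() == 2, the external program's output line
  // "a\tb\tc" is split into key "a\tb" and value "c".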
  @Override
  InputWriter createInputWriter() throws IOException {
    return super.createInputWriter(mapInputWriterClass_);
  }

  @Override
  OutputReader createOutputReader() throws IOException {
    return super.createOutputReader(mapOutputReaderClass_);
  }
}