/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.pig.backend.hadoop.streaming;

import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStream;
import org.apache.pig.impl.streaming.ExecutableManager;
import org.apache.pig.impl.streaming.StreamingCommand.Handle;
import org.apache.pig.impl.streaming.StreamingCommand.HandleSpec;

/**
 * {@link HadoopExecutableManager} is a specialization of
 * {@link ExecutableManager} that adds HDFS-specific support: it copies
 * the secondary outputs of the managed process to HDFS and also
 * persists the task logs there.
 */
public class HadoopExecutableManager extends ExecutableManager {
// The part-<partition> file name, similar to Hadoop's outputs
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}

static String getOutputName(int partition) {
return "part-" + NUMBER_FORMAT.format(partition);
}
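
// Job configuration plus per-task state: the HDFS directories for the
// script's output and logs, the current task id, and the stream used to
// persist this task's stderr on HDFS.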
Configuration job;
String scriptOutputDir;
String scriptLogDir;
String taskId;
FSDataOutputStream errorStream;

public HadoopExecutableManager() {}

@Override
public void configure(POStream stream)
throws IOException, ExecException {
super.configure(stream);
// Make the executable runnable (chmod a+x); an absolute path points to
// a pre-installed binary we don't own, so assume it is already executable.
File executable = new File(command.getExecutable());
if (!executable.isAbsolute()) {
try {
FileUtil.chmod(executable.toString(), "a+x");
} catch (InterruptedException ie) {
int errCode = 6013;
String msg = "Unable to chmod " + executable + ". Thread interrupted.";
throw new ExecException(msg, errCode, PigException.REMOTE_ENVIRONMENT, ie);
}
}
// Save a copy of the JobConf
job = PigMapReduce.sJobConfInternal.get();
// Save the output directory for the Pig Script
scriptOutputDir = job.get("pig.streaming.task.output.dir");
scriptLogDir = job.get("pig.streaming.log.dir", "_logs");
// Save the taskid
// TODO Get an equivalent property in Tez mode (currently this returns null)
taskId = job.get(MRConfiguration.TASK_ID);
}

@Override
protected void exec() throws IOException {
// Create the HDFS file for the stderr of the task, if necessary
if (writeErrorToHDFS(command.getLogFilesLimit(), taskId)) {
try {
Path errorFile =
new Path(new Path(scriptLogDir, command.getLogDir()), taskId);
errorStream =
errorFile.getFileSystem(job).create(errorFile);
} catch (IOException ie) {
// Don't fail the task if we couldn't save its stderr on HDFS
System.err.println("Failed to create stderr file of task: " +
taskId + " in HDFS at " + scriptLogDir +
" with " + ie);
errorStream = null;
}
}
// Header for stderr file of the task
writeDebugHeader();
// Exec the command ...
super.exec();
}

@Override
public void close() throws IOException {
try {
super.close();
// Copy the secondary outputs of the task to HDFS
if (this.scriptOutputDir == null) {
return;
}
Path scriptOutputDir = new Path(this.scriptOutputDir);
FileSystem fs = scriptOutputDir.getFileSystem(job);
List<HandleSpec> outputSpecs = command.getHandleSpecs(Handle.OUTPUT);
if (outputSpecs != null) {
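// Index 0 is the task's primary output (stdout); only the secondary
// outputs that follow it need to be copied out to HDFS.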
for (int i = 1; i < outputSpecs.size(); ++i) {
String fileName = outputSpecs.get(i).getName();
try {
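// Each secondary output gets its own subdirectory under the script's
// output directory, and the copied file is named part-NNNNN after this
// task's partition, mirroring Hadoop's own output naming.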
int partition = job.getInt(MRConfiguration.TASK_PARTITION, -1);
Path dst = new Path(new Path(scriptOutputDir, fileName), getOutputName(partition));
fs.copyFromLocalFile(false, true, new Path(fileName), dst);
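// Replicate the copied file like other job-submission files
// (SUMIT_REPLICATION is the constant's spelling in MRConfiguration).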
fs.setReplication(dst, (short)job.getInt(MRConfiguration.SUMIT_REPLICATION, 3));
} catch (IOException ioe) {
int errCode = 6014;
String msg = "Failed to save secondary output '" +
fileName + "' of task: " + taskId;
throw new ExecException(msg, errCode, PigException.REMOTE_ENVIRONMENT, ioe);
}
}
}
} finally {
// Footer for stderr file of the task
writeDebugFooter();
// Close the stderr file on HDFS
if (errorStream != null) {
errorStream.close();
}
}
}

/**
* Should the stderr data of this task be persisted on HDFS?
*
* @param limit maximum number of tasks whose stderr log-files are persisted
* @param taskId id of the task
* @return <code>true</code> if stderr data of task should be persisted on
* HDFS, <code>false</code> otherwise
*/
protected boolean writeErrorToHDFS(int limit, String taskId) {
if (command.getPersistStderr() && taskId != null) {
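// Persist stderr only for tasks whose index within the job is below
// the configured limit, so large jobs can't flood HDFS with log files.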
int tipId = TaskAttemptID.forName(taskId).getTaskID().getId();
return tipId < limit;
}
return false;
}

@Override
protected void processError(String error) {
super.processError(error);
try {
if (errorStream != null) {
errorStream.writeBytes(error);
}
} catch (IOException ioe) {
super.processError("Failed to save error logs to HDFS with: " +
ioe);
}
}

private void writeDebugHeader() {
processError("===== Task Information Header =====");
processError("\nCommand: " + command);
processError("\nStart time: " + new Date(System.currentTimeMillis()));
if (job.getBoolean(MRConfiguration.TASK_IS_MAP, false)) {
MapContext context = (MapContext)PigMapReduce.sJobContext;
PigSplit pigSplit = (PigSplit)context.getInputSplit();
int numPaths = pigSplit.getNumPaths();
processError("\nPigSplit contains " + numPaths + " wrappedSplits.");
StringBuilder sb = new StringBuilder();
for (int i = 0; i < numPaths; i++) {
InputSplit wrappedSplit = pigSplit.getWrappedSplit(i);
if (wrappedSplit instanceof FileSplit) {
FileSplit mapInputFileSplit = (FileSplit)wrappedSplit;
sb.append("\nInput-split: file=");
sb.append(mapInputFileSplit.getPath());
sb.append(" start-offset=");
sb.append(Long.toString(mapInputFileSplit.getStart()));
sb.append(" length=");
sb.append(Long.toString(mapInputFileSplit.getLength()));
processError(sb.toString());
sb.setLength(0);
}
}
}
processError("\n===== * * * =====\n");
}

private void writeDebugFooter() {
processError("===== Task Information Footer =====");
processError("\nEnd time: " + new Date(System.currentTimeMillis()));
processError("\nExit code: " + exitCode);
List<HandleSpec> inputSpecs = command.getHandleSpecs(Handle.INPUT);
HandleSpec inputSpec =
(inputSpecs != null) ? inputSpecs.get(0) : null;
if (inputSpec == null ||
!inputSpec.getSpec().contains("BinaryStorage")) {
processError("\nInput records: " + inputRecords);
}
processError("\nInput bytes: " + inputBytes + " bytes " +
((inputSpec != null) ?
"(" + inputSpec.getName() + " using " +
inputSpec.getSpec() + ")"
: ""));
List<HandleSpec> outputSpecs = command.getHandleSpecs(Handle.OUTPUT);
HandleSpec outputSpec =
(outputSpecs != null) ? outputSpecs.get(0) : null;
if (outputSpec == null ||
!outputSpec.getSpec().contains("BinaryStorage")) {
processError("\nOutput records: " + outputRecords);
}
processError("\nOutput bytes: " + outputBytes + " bytes " +
((outputSpec != null) ?
"(" + outputSpec.getName() + " using " +
outputSpec.getSpec() + ")"
: ""));
if (outputSpecs != null) {
for (int i = 1; i < outputSpecs.size(); ++i) {
HandleSpec spec = outputSpecs.get(i);
processError("\n " + new File(spec.getName()).length()
+ " bytes using " + spec.getSpec());
}
}
processError("\n===== * * * =====\n");
}
}