hadoop-tools/hadoop-datajoin/src/main/java/org/apache/hadoop/contrib/utils/join/DataJoinJob.java - hadoop - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.hadoop.contrib.utils.join;

 import java.io.IOException;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.RunningJob;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.mapred.TextInputFormat;
 import org.apache.hadoop.mapred.TextOutputFormat;
 import org.apache.hadoop.mapred.JobID;

 /**
  * Command-line driver that configures and runs a map/reduce job joining
  * data of different sources. To create such a job, the user must implement
  * a mapper class that extends DataJoinMapperBase and a reducer class that
  * extends DataJoinReducerBase, and pass both class names as arguments.
  */
 public class DataJoinJob {

   /** Fewest command-line arguments that fully describe a job. */
   private static final int MIN_ARGS = 8;

   /** Most command-line arguments accepted by {@link #main(String[])}. */
   private static final int MAX_ARGS = 10;

   /** Pause between two job status polls, in milliseconds. */
   private static final long POLL_INTERVAL_MS = 60000L;

   /**
    * Loads and initializes the named class using the context class loader
    * of the calling thread.
    *
    * @param className fully qualified name of the class to load
    * @return the loaded class
    * @throws RuntimeException wrapping the original failure when the class
    *         cannot be loaded
    */
   public static Class getClassByName(String className) {
     try {
       ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
       return Class.forName(className, true, classLoader);
     } catch (Exception e) {
       // Keep the cause and say which class was requested.
       throw new RuntimeException("Unable to load class " + className, e);
     }
   }

   /**
    * Builds the job configuration from positional arguments.
    *
    * <ol start="0">
    *   <li>input path(s)</li>
    *   <li>output directory; it is deleted recursively before the job is
    *       configured</li>
    *   <li>input format: "text" (case-insensitive) selects TextInputFormat,
    *       anything else selects SequenceFileInputFormat</li>
    *   <li>number of reduce tasks</li>
    *   <li>mapper class name</li>
    *   <li>reducer class name</li>
    *   <li>map output value class name</li>
    *   <li>"text" (case-insensitive) for TextOutputFormat with Text values,
    *       otherwise the output value class name, written with
    *       SequenceFileOutputFormat</li>
    *   <li>optional: value stored under
    *       "datajoin.maxNumOfValuesPerGroup" (default 100)</li>
    *   <li>optional: description appended to the job name</li>
    * </ol>
    *
    * @param args positional arguments as described above
    * @return the configured job
    * @throws IOException if the output directory cannot be deleted
    * @throws IllegalArgumentException if fewer than eight arguments are given
    * @throws NumberFormatException if a numeric argument cannot be parsed
    */
   public static JobConf createDataJoinJob(String args[]) throws IOException {
     if (args == null || args.length < MIN_ARGS) {
       throw new IllegalArgumentException("Expected at least " + MIN_ARGS
           + " arguments but got " + (args == null ? 0 : args.length));
     }

     String inputDir = args[0];
     String outputDir = args[1];

     Class inputFormat;
     if (args[2].equalsIgnoreCase("text")) {
       System.out.println("Using TextInputFormat: " + args[2]);
       inputFormat = TextInputFormat.class;
     } else {
       System.out.println("Using SequenceFileInputFormat: " + args[2]);
       inputFormat = SequenceFileInputFormat.class;
     }

     int numOfReducers = Integer.parseInt(args[3]);
     Class mapper = getClassByName(args[4]);
     Class reducer = getClassByName(args[5]);
     Class mapoutputValueClass = getClassByName(args[6]);

     Class outputFormat;
     Class outputValueClass;
     if (args[7].equalsIgnoreCase("text")) {
       System.out.println("Using TextOutputFormat: " + args[7]);
       outputFormat = TextOutputFormat.class;
       outputValueClass = Text.class;
     } else {
       System.out.println("Using SequenceFileOutputFormat: " + args[7]);
       outputFormat = SequenceFileOutputFormat.class;
       outputValueClass = getClassByName(args[7]);
     }

     long maxNumOfValuesPerGroup = 100;
     if (args.length > 8) {
       maxNumOfValuesPerGroup = Long.parseLong(args[8]);
     }
     String jobName = "";
     if (args.length > 9) {
       jobName = args[9];
     }

     Configuration defaults = new Configuration();
     JobConf job = new JobConf(defaults, DataJoinJob.class);
     job.setJobName("DataJoinJob: " + jobName);

     // Resolve the file system from the output path itself so that a fully
     // qualified URI on a non-default file system is handled correctly.
     Path outputPath = new Path(outputDir);
     FileSystem fs = outputPath.getFileSystem(job);
     fs.delete(outputPath, true);

     FileInputFormat.setInputPaths(job, inputDir);
     job.setInputFormat(inputFormat);
     job.setMapperClass(mapper);

     FileOutputFormat.setOutputPath(job, outputPath);
     job.setOutputFormat(outputFormat);
     SequenceFileOutputFormat.setOutputCompressionType(job,
             SequenceFile.CompressionType.BLOCK);

     job.setMapOutputKeyClass(Text.class);
     job.setMapOutputValueClass(mapoutputValueClass);
     job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(outputValueClass);
     job.setReducerClass(reducer);

     job.setNumMapTasks(1);
     job.setNumReduceTasks(numOfReducers);
     job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
     return job;
   }

   /**
    * Submits a map/reduce job and polls it once a minute until it completes.
    * If the job does not finish successfully, including when polling fails
    * with an exception, the job is killed before this method returns or
    * throws. An interrupt received while waiting does not stop the polling;
    * the interrupt status of the thread is restored on exit.
    *
    * @param job the job to submit
    * @return true for success
    * @throws IOException if submitting, polling or killing the job fails
    */
   public static boolean runJob(JobConf job) throws IOException {
     JobClient jc = new JobClient(job);
     // Starts out false so that a failure while polling also triggers the
     // kill in the finally block.
     boolean sucess = false;
     boolean interrupted = false;
     RunningJob running = null;
     try {
       running = jc.submitJob(job);
       JobID jobId = running.getID();
       System.out.println("Job " + jobId + " is submitted");
       while (!running.isComplete()) {
         System.out.println("Job " + jobId + " is still running.");
         try {
           Thread.sleep(POLL_INTERVAL_MS);
         } catch (InterruptedException e) {
           // Remember the interrupt; re-asserting it right away would make
           // every following sleep fail immediately.
           interrupted = true;
         }
         RunningJob refreshed = jc.getJob(jobId);
         if (refreshed == null) {
           throw new IOException("Job " + jobId + " is no longer known");
         }
         running = refreshed;
       }
       sucess = running.isSuccessful();
     } finally {
       try {
         if (!sucess && (running != null)) {
           running.killJob();
         }
       } finally {
         try {
           jc.close();
         } finally {
           if (interrupted) {
             Thread.currentThread().interrupt();
           }
         }
       }
     }
     return sucess;
   }

   /**
    * Command-line entry point. Prints a usage message and exits with status
    * -1 when the number of arguments is out of range; exits with status 1
    * when the job fails or an I/O error occurs.
    *
    * @param args see {@link #createDataJoinJob(String[])}
    */
   public static void main(String[] args) {
     if (args.length < MIN_ARGS || args.length > MAX_ARGS) {
       System.out.println("usage: DataJoinJob " + "inputdirs outputdir map_input_file_format "
                          + "numofParts " + "mapper_class " + "reducer_class "
                          + "map_output_value_class "
                          + "output_value_class [maxNumOfValuesPerGroup [descriptionOfJob]]");
       System.exit(-1);
     }

     boolean success = false;
     try {
       JobConf job = DataJoinJob.createDataJoinJob(args);
       success = DataJoinJob.runJob(job);
       if (!success) {
         System.out.println("Job failed");
       }
     } catch (IOException ioe) {
       ioe.printStackTrace();
     }
     if (!success) {
       // Let calling scripts detect the failure.
       System.exit(1);
     }
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.hadoop.contrib.utils.join;

	import java.io.IOException;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.io.SequenceFile;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapred.FileOutputFormat;
	import org.apache.hadoop.mapred.JobClient;
	import org.apache.hadoop.mapred.JobConf;
	import org.apache.hadoop.mapred.FileInputFormat;
	import org.apache.hadoop.mapred.RunningJob;
	import org.apache.hadoop.mapred.SequenceFileInputFormat;
	import org.apache.hadoop.mapred.SequenceFileOutputFormat;
	import org.apache.hadoop.mapred.TextInputFormat;
	import org.apache.hadoop.mapred.TextOutputFormat;
	import org.apache.hadoop.mapred.JobID;

	/**
	* This class implements the main function for creating a map/reduce
	* job to join data of different sources. To create sucn a job, the
	* user must implement a mapper class that extends DataJoinMapperBase class,
	* and a reducer class that extends DataJoinReducerBase.
	*
	*/
	public class DataJoinJob {

	public static Class getClassByName(String className) {
	Class retv = null;
	try {
	ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
	retv = Class.forName(className, true, classLoader);
	} catch (Exception e) {
	throw new RuntimeException(e);
	}
	return retv;
	}

	public static JobConf createDataJoinJob(String args[]) throws IOException {

	String inputDir = args[0];
	String outputDir = args[1];
	Class inputFormat = SequenceFileInputFormat.class;
	if (args[2].compareToIgnoreCase("text") != 0) {
	System.out.println("Using SequenceFileInputFormat: " + args[2]);
	} else {
	System.out.println("Using TextInputFormat: " + args[2]);
	inputFormat = TextInputFormat.class;
	}
	int numOfReducers = Integer.parseInt(args[3]);
	Class mapper = getClassByName(args[4]);
	Class reducer = getClassByName(args[5]);
	Class mapoutputValueClass = getClassByName(args[6]);
	Class outputFormat = TextOutputFormat.class;
	Class outputValueClass = Text.class;
	if (args[7].compareToIgnoreCase("text") != 0) {
	System.out.println("Using SequenceFileOutputFormat: " + args[7]);
	outputFormat = SequenceFileOutputFormat.class;
	outputValueClass = getClassByName(args[7]);
	} else {
	System.out.println("Using TextOutputFormat: " + args[7]);
	}
	long maxNumOfValuesPerGroup = 100;
	String jobName = "";
	if (args.length > 8) {
	maxNumOfValuesPerGroup = Long.parseLong(args[8]);
	}
	if (args.length > 9) {
	jobName = args[9];
	}
	Configuration defaults = new Configuration();
	JobConf job = new JobConf(defaults, DataJoinJob.class);
	job.setJobName("DataJoinJob: " + jobName);

	FileSystem fs = FileSystem.get(defaults);
	fs.delete(new Path(outputDir), true);
	FileInputFormat.setInputPaths(job, inputDir);

	job.setInputFormat(inputFormat);

	job.setMapperClass(mapper);
	FileOutputFormat.setOutputPath(job, new Path(outputDir));
	job.setOutputFormat(outputFormat);
	SequenceFileOutputFormat.setOutputCompressionType(job,
	SequenceFile.CompressionType.BLOCK);
	job.setMapOutputKeyClass(Text.class);
	job.setMapOutputValueClass(mapoutputValueClass);
	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(outputValueClass);
	job.setReducerClass(reducer);

	job.setNumMapTasks(1);
	job.setNumReduceTasks(numOfReducers);
	job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
	return job;
	}

	/**
	* Submit/run a map/reduce job.
	*
	* @param job
	* @return true for success
	* @throws IOException
	*/
	public static boolean runJob(JobConf job) throws IOException {
	JobClient jc = new JobClient(job);
	boolean sucess = true;
	RunningJob running = null;
	try {
	running = jc.submitJob(job);
	JobID jobId = running.getID();
	System.out.println("Job " + jobId + " is submitted");
	while (!running.isComplete()) {
	System.out.println("Job " + jobId + " is still running.");
	try {
	Thread.sleep(60000);
	} catch (InterruptedException e) {
	}
	running = jc.getJob(jobId);
	}
	sucess = running.isSuccessful();
	} finally {
	if (!sucess && (running != null)) {
	running.killJob();
	}
	jc.close();
	}
	return sucess;
	}

	/**
	* @param args
	*/
	public static void main(String[] args) {
	boolean success;
	if (args.length < 8 \|\| args.length > 10) {
	System.out.println("usage: DataJoinJob " + "inputdirs outputdir map_input_file_format "
	+ "numofParts " + "mapper_class " + "reducer_class "
	+ "map_output_value_class "
	+ "output_value_class [maxNumOfValuesPerGroup [descriptionOfJob]]]");
	System.exit(-1);
	}

	try {
	JobConf job = DataJoinJob.createDataJoinJob(args);
	success = DataJoinJob.runJob(job);
	if (!success) {
	System.out.println("Job failed");
	}
	} catch (IOException ioe) {
	ioe.printStackTrace();
	}
	}
	}