demos/src/main/java/com/datatorrent/demos/mroperator/LineIndexer.java - apex-malhar - Git at Google

 /*
  * Copyright (c) 2013 DataTorrent, Inc. ALL Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package com.datatorrent.demos.mroperator;

 import java.io.IOException;
 import java.util.Iterator;
 import java.util.StringTokenizer;

 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.MapReduceBase;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;

 /**
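  * <p>Inverted-index MapReduce demo: maps every lower-cased word to a comma-separated list of the names of the input files it was read from.</p>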
  *
  * @since 0.9.0
  */
 public class LineIndexer {

   /**
    * Mapper that emits one {@code (word, fileName)} pair for every
    * whitespace-delimited token of an input line.
    *
    * <p>Tokens are lower-cased before being emitted; the value is the name
    * taken from the path of the file split currently being processed.</p>
    */
   public static class LineIndexMapper extends MapReduceBase
       implements Mapper<LongWritable, Text, Text, Text> {

     // Reusable output buffers. Kept per instance rather than static so that
     // several mapper instances in one JVM never write into the same objects.
     private final Text word = new Text();
     private final Text location = new Text();

     /**
      * Tokenizes one line and collects a {@code (token, fileName)} pair per token.
      *
      * @param key      key supplied with the line; not used
      * @param val      the line of text to tokenize
      * @param output   collector receiving the pairs
      * @param reporter used to look up the input split being processed; the
      *                 split is cast to {@link FileSplit}
      * @throws IOException if collecting a pair fails
      */
     public void map(LongWritable key, Text val,
         OutputCollector<Text, Text> output, Reporter reporter)
         throws IOException {

       FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
       location.set(fileSplit.getPath().getName());

       // Locale.ROOT keeps the case folding independent of the JVM's default
       // locale (for example the Turkish dotless-i mapping).
       String lowered = val.toString().toLowerCase(java.util.Locale.ROOT);
       StringTokenizer tokens = new StringTokenizer(lowered);
       while (tokens.hasMoreTokens()) {
         word.set(tokens.nextToken());
         output.collect(word, location);
       }
     }
   }


   /**
    * Reducer that joins all values received for a key into a single
    * comma-separated string.
    */
   public static class LineIndexReducer extends MapReduceBase
       implements Reducer<Text, Text, Text, Text> {

     /**
      * Concatenates every value of {@code key}, separated by {@code ", "}, in
      * iteration order and collects the result under the same key.
      *
      * @param key      the key whose values are being joined
      * @param values   the values to join; an empty iterator yields an empty string
      * @param output   collector receiving the single {@code (key, joined)} pair
      * @param reporter not used
      * @throws IOException if collecting the pair fails
      */
     public void reduce(Text key, Iterator<Text> values,
         OutputCollector<Text, Text> output, Reporter reporter)
         throws IOException {

       StringBuilder joined = new StringBuilder();
       // Empty before the first value, ", " before every later one.
       String separator = "";
       while (values.hasNext()) {
         joined.append(separator).append(values.next().toString());
         separator = ", ";
       }

       Text result = new Text(joined.toString());
       output.collect(key, result);
     }
   }


   /**
    * Driver that configures and runs the LineIndexer MapReduce job.
    *
    * <p>The input and output locations default to {@code "input"} and
    * {@code "output"}; they may be overridden with the first and second
    * command-line argument respectively.</p>
    *
    * @param args optional arguments: {@code [inputPath [outputPath]]}
    */
   public static void main(String[] args) {
     // Fall back to the historical hard-coded locations when no argument is given.
     String inputPath = (args != null && args.length > 0) ? args[0] : "input";
     String outputPath = (args != null && args.length > 1) ? args[1] : "output";

     JobConf conf = new JobConf(LineIndexer.class);

     conf.setJobName("LineIndexer");

     conf.setOutputKeyClass(Text.class);
     conf.setOutputValueClass(Text.class);

     FileInputFormat.addInputPath(conf, new Path(inputPath));
     FileOutputFormat.setOutputPath(conf, new Path(outputPath));

     conf.setMapperClass(LineIndexMapper.class);
     conf.setReducerClass(LineIndexReducer.class);

     try {
       // runJob is static and takes the configuration directly, so no
       // JobClient instance has to be created here.
       JobClient.runJob(conf);
     } catch (Exception e) {
       // Report the failure on stderr; the driver itself still returns normally.
       e.printStackTrace();
     }
   }
 }
	/*
	* Copyright (c) 2013 DataTorrent, Inc. ALL Rights Reserved.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package com.datatorrent.demos.mroperator;

	import java.io.IOException;
	import java.util.Iterator;
	import java.util.StringTokenizer;

	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.io.LongWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapred.FileInputFormat;
	import org.apache.hadoop.mapred.FileOutputFormat;
	import org.apache.hadoop.mapred.FileSplit;
	import org.apache.hadoop.mapred.JobClient;
	import org.apache.hadoop.mapred.JobConf;
	import org.apache.hadoop.mapred.MapReduceBase;
	import org.apache.hadoop.mapred.Mapper;
	import org.apache.hadoop.mapred.OutputCollector;
	import org.apache.hadoop.mapred.Reducer;
	import org.apache.hadoop.mapred.Reporter;

	/**
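	* <p>Inverted-index MapReduce demo: maps every lower-cased word to a comma-separated list of the names of the input files it was read from.</p>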
	*
	* @since 0.9.0
	*/
	public class LineIndexer {

	/**
	 * Mapper that emits one {@code (word, fileName)} pair for every
	 * whitespace-delimited token of an input line.
	 *
	 * <p>Tokens are lower-cased before being emitted; the value is the name
	 * taken from the path of the file split currently being processed.</p>
	 */
	public static class LineIndexMapper extends MapReduceBase
	    implements Mapper<LongWritable, Text, Text, Text> {

	  // Reusable output buffers. Kept per instance rather than static so that
	  // several mapper instances in one JVM never write into the same objects.
	  private final Text word = new Text();
	  private final Text location = new Text();

	  /**
	   * Tokenizes one line and collects a {@code (token, fileName)} pair per token.
	   *
	   * @param key      key supplied with the line; not used
	   * @param val      the line of text to tokenize
	   * @param output   collector receiving the pairs
	   * @param reporter used to look up the input split being processed; the
	   *                 split is cast to {@link FileSplit}
	   * @throws IOException if collecting a pair fails
	   */
	  public void map(LongWritable key, Text val,
	      OutputCollector<Text, Text> output, Reporter reporter)
	      throws IOException {

	    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
	    location.set(fileSplit.getPath().getName());

	    // Locale.ROOT keeps the case folding independent of the JVM's default
	    // locale (for example the Turkish dotless-i mapping).
	    String lowered = val.toString().toLowerCase(java.util.Locale.ROOT);
	    StringTokenizer tokens = new StringTokenizer(lowered);
	    while (tokens.hasMoreTokens()) {
	      word.set(tokens.nextToken());
	      output.collect(word, location);
	    }
	  }
	}



	/**
	 * Reducer that joins all values received for a key into a single
	 * comma-separated string.
	 */
	public static class LineIndexReducer extends MapReduceBase
	    implements Reducer<Text, Text, Text, Text> {

	  /**
	   * Concatenates every value of {@code key}, separated by {@code ", "}, in
	   * iteration order and collects the result under the same key.
	   *
	   * @param key      the key whose values are being joined
	   * @param values   the values to join; an empty iterator yields an empty string
	   * @param output   collector receiving the single {@code (key, joined)} pair
	   * @param reporter not used
	   * @throws IOException if collecting the pair fails
	   */
	  public void reduce(Text key, Iterator<Text> values,
	      OutputCollector<Text, Text> output, Reporter reporter)
	      throws IOException {

	    StringBuilder joined = new StringBuilder();
	    // Empty before the first value, ", " before every later one.
	    String separator = "";
	    while (values.hasNext()) {
	      joined.append(separator).append(values.next().toString());
	      separator = ", ";
	    }

	    Text result = new Text(joined.toString());
	    output.collect(key, result);
	  }
	}


	/**
	 * Driver that configures and runs the LineIndexer MapReduce job.
	 *
	 * <p>The input and output locations default to {@code "input"} and
	 * {@code "output"}; they may be overridden with the first and second
	 * command-line argument respectively.</p>
	 *
	 * @param args optional arguments: {@code [inputPath [outputPath]]}
	 */
	public static void main(String[] args) {
	  // Fall back to the historical hard-coded locations when no argument is given.
	  String inputPath = (args != null && args.length > 0) ? args[0] : "input";
	  String outputPath = (args != null && args.length > 1) ? args[1] : "output";

	  JobConf conf = new JobConf(LineIndexer.class);

	  conf.setJobName("LineIndexer");

	  conf.setOutputKeyClass(Text.class);
	  conf.setOutputValueClass(Text.class);

	  FileInputFormat.addInputPath(conf, new Path(inputPath));
	  FileOutputFormat.setOutputPath(conf, new Path(outputPath));

	  conf.setMapperClass(LineIndexMapper.class);
	  conf.setReducerClass(LineIndexReducer.class);

	  try {
	    // runJob is static and takes the configuration directly, so no
	    // JobClient instance has to be created here.
	    JobClient.runJob(conf);
	  } catch (Exception e) {
	    // Report the failure on stderr; the driver itself still returns normally.
	    e.printStackTrace();
	  }
	}
	}