/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.contrib.index.mapred;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory;
import org.apache.hadoop.contrib.index.lucene.LuceneUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

/**
* An implementation of the index updater interface which creates a Map/Reduce
* job configuration and runs the Map/Reduce job to analyze documents and
* update Lucene index shards in parallel.
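*
* <p>A minimal usage sketch (illustrative only; the paths, shard arguments and
* map task count below are assumptions made for the example, not values used
* by this class):
* <pre>
*   Configuration conf = new Configuration();
*   Path[] inputs = { new Path("/myapp/docs") };        // documents to analyze
*   Path output = new Path("/myapp/index.out");         // Map/Reduce output dir
*   Shard[] shards = { new Shard(-1, "/myapp/index/shard0", -1) };
*   new IndexUpdater().run(conf, inputs, output, 4, shards);   // 4 map tasks
* </pre>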
*/
public class IndexUpdater implements IIndexUpdater {
  public static final Log LOG = LogFactory.getLog(IndexUpdater.class);

  public IndexUpdater() {
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.contrib.index.mapred.IIndexUpdater#run(org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path[], org.apache.hadoop.fs.Path, int, org.apache.hadoop.contrib.index.mapred.Shard[])
   */
  public void run(Configuration conf, Path[] inputPaths, Path outputPath,
      int numMapTasks, Shard[] shards) throws IOException {
    JobConf jobConf =
        createJob(conf, inputPaths, outputPath, numMapTasks, shards);
    JobClient.runJob(jobConf);
  }
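
  /**
   * Builds the Map/Reduce job configuration that {@link #run} submits:
   * it records the shards' starting generations, halves io.sort.mb for the
   * combiner, wires up the index update mapper/combiner/reducer classes, and
   * creates one reduce task per shard.
   *
   * @param conf base configuration; shard and io.sort.mb settings are written to it
   * @param inputPaths input paths containing the documents to analyze
   * @param outputPath output path of the Map/Reduce job
   * @param numMapTasks number of map tasks to request
   * @param shards the index shards to update; one reduce task is used per shard
   * @return the configured JobConf
   */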
  JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
      int numMapTasks, Shard[] shards) throws IOException {
    // Set the starting generation for each shard: when a reduce task fails,
    // the replacement reduce task has to know where to re-start.
    setShardGeneration(conf, shards);

    // iconf.set(...) writes the properties through to the wrapped conf
    IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
    Shard.setIndexShards(iconf, shards);

    // MapTask.MapOutputBuffer uses JobContext.IO_SORT_MB to decide its max
    // buffer size (max buffer size = 1/2 * JobContext.IO_SORT_MB).
    // Here we halve JobContext.IO_SORT_MB because we use the other half of the
    // memory to build an intermediate form/index in the Combiner.
    iconf.setIOSortMB(iconf.getIOSortMB() / 2);
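
    // Illustrative numbers only: if io.sort.mb was configured as 128, the line
    // above reduces it to 64 for the map-side sort buffer, leaving roughly the
    // other 64 MB for the combiner's intermediate form/index.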

    // create the job configuration
    JobConf jobConf = new JobConf(conf, IndexUpdater.class);
    jobConf.setJobName(this.getClass().getName() + "_"
        + System.currentTimeMillis());

    // provided by the application
    FileInputFormat.setInputPaths(jobConf, inputPaths);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    jobConf.setNumMapTasks(numMapTasks);

    // one reduce task per shard; the shards themselves were set on iconf above
    jobConf.setNumReduceTasks(shards.length);

    jobConf.setInputFormat(iconf.getIndexInputFormatClass());

    Path[] inputs = FileInputFormat.getInputPaths(jobConf);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
      buffer.append(",");
      buffer.append(inputs[i].toString());
    }
    LOG.info("mapred.input.dir = " + buffer.toString());
    LOG.info("mapreduce.output.fileoutputformat.outputdir = " +
        FileOutputFormat.getOutputPath(jobConf).toString());
    LOG.info("mapreduce.job.maps = " + jobConf.getNumMapTasks());
    LOG.info("mapreduce.job.reduces = " + jobConf.getNumReduceTasks());
    LOG.info(shards.length + " shards = " + iconf.getIndexShards());
    // better if we did not have to create an input format instance just to log its class
    LOG.info("mapred.input.format.class = "
        + jobConf.getInputFormat().getClass().getName());

    // set by the system
    jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
    jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
    jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
    jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

    jobConf.setMapperClass(IndexUpdateMapper.class);
    jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
    jobConf.setCombinerClass(IndexUpdateCombiner.class);
    jobConf.setReducerClass(IndexUpdateReducer.class);
    jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

    return jobConf;
  }
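
  /**
   * Reads the current Lucene segment generation of each shard directory and
   * stores it as the shard's starting generation, so that a restarted reduce
   * task knows where to resume. A generation of -1 is kept for shards whose
   * directory does not exist yet.
   */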
  void setShardGeneration(Configuration conf, Shard[] shards)
      throws IOException {
    FileSystem fs = FileSystem.get(conf);

    for (int i = 0; i < shards.length; i++) {
      Path path = new Path(shards[i].getDirectory());
      long generation = -1; // remains -1 when the shard directory does not exist yet

      if (fs.exists(path)) {
        FileSystemDirectory dir = null;
        try {
          dir = new FileSystemDirectory(fs, path, false, conf);
          generation = LuceneUtil.getCurrentSegmentGeneration(dir);
        } finally {
          if (dir != null) {
            dir.close();
          }
        }
      }

      if (generation != shards[i].getGeneration()) {
        // set the starting generation for the shard
        shards[i] =
            new Shard(shards[i].getVersion(), shards[i].getDirectory(),
                generation);
      }
    }
  }
}