contrib/ant/src/java/org/apache/lucene/ant/IndexTask.java - lucene-solr - Git at Google

 package org.apache.lucene.ant;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.File;
 import java.io.IOException;
 import java.text.ParseException;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
 import java.util.Vector;
 import java.lang.reflect.Constructor;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LogMergePolicy;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Version;
 import org.apache.tools.ant.BuildException;
 import org.apache.tools.ant.DynamicConfigurator;
 import org.apache.tools.ant.Project;
 import org.apache.tools.ant.Task;
 import org.apache.tools.ant.types.EnumeratedAttribute;
 import org.apache.tools.ant.types.FileSet;
 import org.apache.tools.ant.types.Resource;
 import org.apache.tools.ant.types.ResourceCollection;
 import org.apache.tools.ant.types.resources.FileResource;

 /**
  *  Ant task to index files with Lucene
  *
  */
 public class IndexTask extends Task {
   /**
    *  Resource collections (such as nested filesets) whose files are indexed.
    */
   protected Vector<ResourceCollection> rcs = new Vector<ResourceCollection>();

   /**
    *  Overwrite an existing index instead of updating it?
    */
   private boolean overwrite = false;

   /**
    *  Directory in which the index is stored.
    */
   private File indexDir;

   /**
    *  Class name of the document handler; instantiated in {@link #execute()}.
    */
   private String handlerClassName =
     FileExtensionDocumentHandler.class.getName();

   /**
    *  Document handler instance, created in {@link #execute()}.
    */
   private DocumentHandler handler;

   /**
    *  Class name of the analyzer; instantiated in {@link #execute()}.
    */
   private String analyzerClassName =
     StandardAnalyzer.class.getName();

   /**
    *  Analyzer instance, created in {@link #execute()}.
    */
   private Analyzer analyzer;

   /**
    *  Merge factor applied to the index writer's merge policy.
    */
   private int mergeFactor = 20;

   /**
    *  Optional properties handed to a {@link ConfigurableDocumentHandler}.
    */
   private HandlerConfig handlerConfig;

   /**
    *  Whether the index writer uses the compound file format.
    */
   private boolean useCompoundIndex = true;


   /**
    *  Creates new instance
    */
   public IndexTask() {
   }


   /**
    *  Specifies the directory where the index will be stored
    *
    * @param indexDir  the index directory
    */
   public void setIndex(File indexDir) {
     this.indexDir = indexDir;
   }


   /**
    *  Sets the mergeFactor attribute of the IndexTask object
    *
    * @param mergeFactor  The new mergeFactor value
    */
   public void setMergeFactor(int mergeFactor) {
     this.mergeFactor = mergeFactor;
   }


   /**
    *  Sets the overwrite attribute of the IndexTask object
    *
    * @param overwrite  The new overwrite value
    */
   public void setOverwrite(boolean overwrite) {
     this.overwrite = overwrite;
   }


   /**
    * Sets whether the index writer uses the compound file format
    * (applied to both the compound-file and compound-doc-store settings
    * of the merge policy).
    *
    * @param useCompoundIndex  The new useCompoundIndex value
    */
   public void setUseCompoundIndex(boolean useCompoundIndex) {
     this.useCompoundIndex = useCompoundIndex;
   }

   /**
    *  Sets the documentHandler attribute of the IndexTask object
    *
    * @param classname  The new documentHandler value
    */
   public void setDocumentHandler(String classname) {
     handlerClassName = classname;
   }

   /**
    * Sets the analyzer based on the builtin Lucene analyzer types.
    * This overwrites any class name set through
    * {@link #setAnalyzerClassName(String)}; whichever setter runs last wins.
    *
    * TODO: Enforce analyzer and analyzerClassName to be mutually exclusive
    *
    * @param type  one of the values listed by {@link AnalyzerType#getValues()}
    */
   public void setAnalyzer(AnalyzerType type) {
     analyzerClassName = type.getClassname();
   }

   /**
    * Sets the fully qualified class name of the analyzer to use.
    *
    * @param classname  analyzer class name
    */
   public void setAnalyzerClassName(String classname) {
     analyzerClassName = classname;
   }

   /**
    *  Adds a set of files (nested fileset attribute).
    *
    * @param set  FileSet to be added
    */
   public void addFileset(FileSet set) {
     add(set);
   }

   /**
    * Adds a collection of files to index.
    *
    * @param res a resource collection to index.
    * @since Ant 1.7
    */
   public void add(ResourceCollection res) {
     rcs.add(res);
   }

   /**
    * Sets custom properties for a configurable document handler.
    *
    * @param config  the nested config element
    * @throws BuildException if a config element was already added
    */
   public void addConfig(HandlerConfig config) throws BuildException {
     if (handlerConfig != null) {
       throw new BuildException("Only one config element allowed");
     }

     handlerConfig = config;
   }

   /**
    * Instantiates the named analyzer class reflectively.
    *
    * @param className  fully qualified name of an {@link Analyzer} subclass
    * @return a new analyzer instance
    * @throws Exception if the class cannot be loaded, is not an Analyzer,
    *         or cannot be instantiated
    */
   private static Analyzer createAnalyzer(String className) throws Exception {
     final Class<? extends Analyzer> clazz = Class.forName(className).asSubclass(Analyzer.class);
     try {
       // first try to use a ctor with version parameter (needed for many
       // new Analyzers that have no default one anymore)
       Constructor<? extends Analyzer> cnstr = clazz.getConstructor(Version.class);
       return cnstr.newInstance(Version.LUCENE_CURRENT);
     } catch (NoSuchMethodException nsme) {
       // otherwise use default ctor
       return clazz.newInstance();
     }
   }

   /**
    *  Begins the indexing
    *
    * @exception  BuildException  If the index directory was not set, the
    *      handler or analyzer cannot be created, or an error occurs
    *      indexing the fileset
    */
   @Override
   public void execute() throws BuildException {
     // Fail with a readable message instead of a NullPointerException
     // further down when the index directory was never configured.
     if (indexDir == null) {
       throw new BuildException("index attribute is required");
     }

     // construct handler and analyzer dynamically
     try {
       handler = Class.forName(handlerClassName).asSubclass(DocumentHandler.class).newInstance();

       analyzer = IndexTask.createAnalyzer(analyzerClassName);
     } catch (Exception e) {
       throw new BuildException(e);
     }

     log("Document handler = " + handler.getClass(), Project.MSG_VERBOSE);
     log("Analyzer = " + analyzer.getClass(), Project.MSG_VERBOSE);

     if (handler instanceof ConfigurableDocumentHandler) {
       // The nested config element is optional; hand the handler an empty
       // property set when it is absent rather than dereferencing null.
       Properties handlerProps = (handlerConfig != null)
         ? handlerConfig.getProperties()
         : new Properties();
       ((ConfigurableDocumentHandler) handler).configure(handlerProps);
     }

     try {
       indexDocs();
     } catch (IOException e) {
       throw new BuildException(e);
     }
   }


   /**
    * Indexes every readable file of the configured resource collections.
    * When an existing index is being updated, files whose stored
    * "modified" value equals their current last-modified time are skipped.
    *
    * @exception  IOException if Lucene I/O exception
    */
   private void indexDocs() throws IOException {
     final long startMillis = System.currentTimeMillis();

     // mkdirs() must run even when overwrite is set, so call it
     // unconditionally. It returns true only if the directory was created
     // just now, in which case there is no index to update: force create mode.
     final boolean createdDir = indexDir.mkdirs();
     final boolean create = overwrite || createdDir;

     FSDirectory dir = FSDirectory.open(indexDir);
     try {
       Searcher searcher = null;
       if (!create) {
         try {
           searcher = new IndexSearcher(dir, true);
         } catch (IOException ioe) {
           log("IOException: " + ioe.getMessage());
           // No usable index to compare against: index all documents.
         }
       }
       final boolean checkLastModified = (searcher != null);

       log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE);

       int totalFiles = 0;
       int totalIndexed = 0;
       int totalIgnored = 0;
       try {
         // CREATE_OR_APPEND (not APPEND) so that an existing directory
         // that does not contain an index yet is populated, not rejected.
         IndexWriterConfig conf = new IndexWriterConfig(
             Version.LUCENE_CURRENT, analyzer).setOpenMode(
             create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
         LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
         lmp.setUseCompoundFile(useCompoundIndex);
         lmp.setUseCompoundDocStore(useCompoundIndex);
         lmp.setMergeFactor(mergeFactor);

         IndexWriter writer = new IndexWriter(dir, conf);
         try {
           for (ResourceCollection rc : rcs) {
             if (!rc.isFilesystemOnly()) {
               continue;
             }
             Iterator<?> resources = rc.iterator();
             while (resources.hasNext()) {
               Resource r = (Resource) resources.next();
               if (!r.isExists() || !(r instanceof FileResource)) {
                 continue;
               }

               totalFiles++;

               File file = ((FileResource) r).getFile();

               if (!file.exists() || !file.canRead()) {
                 throw new BuildException("File \"" +
                                          file.getAbsolutePath()
                                          + "\" does not exist or is not readable.");
               }

               // don't index if the stored copy is up to date
               if (checkLastModified && isUpToDate(searcher, file)) {
                 continue;
               }

               log("Indexing " + file.getPath(), Project.MSG_VERBOSE);
               Document doc;
               try {
                 doc = handler.getDocument(file);
               } catch (DocumentHandlerException e) {
                 throw new BuildException(e);
               }

               if (doc == null) {
                 totalIgnored++;
                 continue;
               }

               // Add the path of the file as a field named "path".  Stored and
               // not analyzed, so the exact path is both retrievable and searchable.
               doc.add(new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

               // Add the last modified date of the file a field named "modified".
               // Stored and not analyzed, so it is searchable but never tokenized.
               doc.add(new Field("modified", DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.NOT_ANALYZED));

               if (create) {
                 writer.addDocument(doc);
               } else {
                 // Updating an existing index: replace any document already
                 // stored for this path so that stale copies do not pile up.
                 writer.updateDocument(new Term("path", file.getPath()), doc);
               }
               totalIndexed++;
             }
           }

           writer.optimize();
         } finally {
           writer.close();
         }
       } finally {
         // Close the searcher even if creating or closing the writer failed.
         if (searcher != null) {
           searcher.close();
         }
       }

       log(totalIndexed + " out of " + totalFiles + " indexed (" +
           totalIgnored + " ignored) in " + (System.currentTimeMillis() - startMillis) +
           " milliseconds");
     } finally {
       dir.close();
     }
   }

   /**
    * Tells whether the index already holds a current copy of the file.
    *
    * @param searcher  searcher over the existing index
    * @param file      file about to be indexed
    * @return true if a document with the file's path exists and its
    *         "modified" field parses to the file's last-modified time;
    *         false if no such document exists or the field is missing
    *         or unparsable
    * @throws IOException if searching the index fails
    */
   private static boolean isUpToDate(Searcher searcher, File file) throws IOException {
     TermQuery query = new TermQuery(new Term("path", file.getPath()));
     ScoreDoc[] hits = searcher.search(query, null, 1).scoreDocs;
     if (hits.length == 0) {
       return false;
     }

     String indexModified = searcher.doc(hits[0].doc).get("modified");
     if (indexModified == null) {
       // document was stored without a modification stamp: re-index it
       return false;
     }

     try {
       return DateTools.stringToTime(indexModified.trim()) == file.lastModified();
     } catch (ParseException e) {
       // an unparsable stamp cannot prove the copy is current: re-index
       return false;
     }
   }

   /**
    * Nested config element; every attribute set on it is collected
    * as a property for the document handler.
    */
   public static class HandlerConfig implements DynamicConfigurator {
     Properties props = new Properties();

     /**
      * Stores the attribute as a property.
      */
     public void setDynamicAttribute(String attributeName, String value) throws BuildException {
       props.setProperty(attributeName, value);
     }

     /**
      * Nested elements are not supported.
      *
      * @throws BuildException always
      */
     public Object createDynamicElement(String elementName) throws BuildException {
       throw new BuildException("Sub elements not supported");
     }

     /**
      * @return the collected properties
      */
     public Properties getProperties() {
       return props;
     }
   }

   /**
    * Enumerated attribute mapping short analyzer names to the class
    * names of the builtin Lucene analyzers.
    */
   public static class AnalyzerType extends EnumeratedAttribute {
     private static final Map<String,String> analyzerLookup = new HashMap<String,String>();

     static {
       analyzerLookup.put("simple", SimpleAnalyzer.class.getName());
       analyzerLookup.put("standard", StandardAnalyzer.class.getName());
       analyzerLookup.put("stop", StopAnalyzer.class.getName());
       analyzerLookup.put("whitespace", WhitespaceAnalyzer.class.getName());
     }

     /**
      * @see EnumeratedAttribute#getValues
      */
     @Override
     public String[] getValues() {
       Set<String> keys = analyzerLookup.keySet();
       return keys.toArray(new String[0]);
     }

     /**
      * @return the analyzer class name registered for the current value
      */
     public String getClassname() {
       return analyzerLookup.get(getValue());
     }
   }
 }
	package org.apache.lucene.ant;

	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.File;
	import java.io.IOException;
	import java.text.ParseException;
	import java.util.Date;
	import java.util.HashMap;
	import java.util.Iterator;
	import java.util.Map;
	import java.util.Properties;
	import java.util.Set;
	import java.util.Vector;
	import java.lang.reflect.Constructor;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.SimpleAnalyzer;
	import org.apache.lucene.analysis.StopAnalyzer;
	import org.apache.lucene.analysis.WhitespaceAnalyzer;
	import org.apache.lucene.analysis.standard.StandardAnalyzer;
	import org.apache.lucene.document.DateTools;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.index.IndexWriterConfig;
	import org.apache.lucene.index.LogMergePolicy;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.IndexWriterConfig.OpenMode;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.Searcher;
	import org.apache.lucene.search.TermQuery;
	import org.apache.lucene.store.FSDirectory;
	import org.apache.lucene.util.Version;
	import org.apache.tools.ant.BuildException;
	import org.apache.tools.ant.DynamicConfigurator;
	import org.apache.tools.ant.Project;
	import org.apache.tools.ant.Task;
	import org.apache.tools.ant.types.EnumeratedAttribute;
	import org.apache.tools.ant.types.FileSet;
	import org.apache.tools.ant.types.Resource;
	import org.apache.tools.ant.types.ResourceCollection;
	import org.apache.tools.ant.types.resources.FileResource;

	/**
	* Ant task to index files with Lucene
	*
	*/
	public class IndexTask extends Task {
	/**
	* resources
	*/
	protected Vector<ResourceCollection> rcs = new Vector<ResourceCollection>();

	/**
	* overwrite index?
	*/
	private boolean overwrite = false;

	/**
	* index path
	*/
	private File indexDir;

	/**
	* document handler classname
	*/
	private String handlerClassName =
	FileExtensionDocumentHandler.class.getName();

	/**
	* document handler instance
	*/
	private DocumentHandler handler;


	/**
	*
	*/
	private String analyzerClassName =
	StandardAnalyzer.class.getName();

	/**
	* analyzer instance
	*/
	private Analyzer analyzer;

	/**
	* Lucene merge factor
	*/
	private int mergeFactor = 20;

	private HandlerConfig handlerConfig;

	private boolean useCompoundIndex = true;


	/**
	* Creates new instance
	*/
	public IndexTask() {
	}


	/**
	* Specifies the directory where the index will be stored
	*/
	public void setIndex(File indexDir) {
	this.indexDir = indexDir;
	}


	/**
	* Sets the mergeFactor attribute of the IndexTask object
	*
	*@param mergeFactor The new mergeFactor value
	*/
	public void setMergeFactor(int mergeFactor) {
	this.mergeFactor = mergeFactor;
	}


	/**
	* Sets the overwrite attribute of the IndexTask object
	*
	*@param overwrite The new overwrite value
	*/
	public void setOverwrite(boolean overwrite) {
	this.overwrite = overwrite;
	}


	/**
	* If creating a new index and this is set to true, the
	* index will be created in compound format.
	*/
	public void setUseCompoundIndex(boolean useCompoundIndex) {
	this.useCompoundIndex = useCompoundIndex;
	}

	/**
	* Sets the documentHandler attribute of the IndexTask object
	*
	*@param classname The new documentHandler value
	*/
	public void setDocumentHandler(String classname) {
	handlerClassName = classname;
	}

	/**
	* Sets the analyzer based on the builtin Lucene analyzer types.
	*
	* TODO: Enforce analyzer and analyzerClassName to be mutually exclusive
	*/
	public void setAnalyzer(AnalyzerType type) {
	analyzerClassName = type.getClassname();
	}

	public void setAnalyzerClassName(String classname) {
	analyzerClassName = classname;
	}

	/**
	* Adds a set of files (nested fileset attribute).
	*
	*@param set FileSet to be added
	*/
	public void addFileset(FileSet set) {
	add(set);
	}

	/**
	* Add a collection of files to copy.
	* @param res a resource collection to copy.
	* @since Ant 1.7
	*/
	public void add(ResourceCollection res) {
	rcs.add(res);
	}

	/**
	* Sets custom properties for a configurable document handler.
	*/
	public void addConfig(HandlerConfig config) throws BuildException {
	if (handlerConfig != null) {
	throw new BuildException("Only one config element allowed");
	}

	handlerConfig = config;
	}

	private static final Analyzer createAnalyzer(String className) throws Exception{
	final Class<? extends Analyzer> clazz = Class.forName(className).asSubclass(Analyzer.class);
	try {
	// first try to use a ctor with version parameter (needed for many new Analyzers that have no default one anymore
	Constructor<? extends Analyzer> cnstr = clazz.getConstructor(Version.class);
	return cnstr.newInstance(Version.LUCENE_CURRENT);
	} catch (NoSuchMethodException nsme) {
	// otherwise use default ctor
	return clazz.newInstance();
	}
	}

	/**
	* Begins the indexing
	*
	*@exception BuildException If an error occurs indexing the
	* fileset
	*/
	@Override
	public void execute() throws BuildException {

	// construct handler and analyzer dynamically
	try {
	handler = Class.forName(handlerClassName).asSubclass(DocumentHandler.class).newInstance();

	analyzer = IndexTask.createAnalyzer(analyzerClassName);
	} catch (Exception e) {
	throw new BuildException(e);
	}

	log("Document handler = " + handler.getClass(), Project.MSG_VERBOSE);
	log("Analyzer = " + analyzer.getClass(), Project.MSG_VERBOSE);

	if (handler instanceof ConfigurableDocumentHandler) {
	((ConfigurableDocumentHandler) handler).configure(handlerConfig.getProperties());
	}

	try {
	indexDocs();
	} catch (IOException e) {
	throw new BuildException(e);
	}
	}


	/**
	* Index the fileset.
	*
	*@exception IOException if Lucene I/O exception
	*TODO: refactor!!!!!
	*/
	private void indexDocs() throws IOException {
	Date start = new Date();

	boolean create = overwrite;
	// If the index directory doesn't exist,
	// create it and force create mode
	if (indexDir.mkdirs() && !overwrite) {
	create = true;
	}

	FSDirectory dir = FSDirectory.open(indexDir);
	try {
	Searcher searcher = null;
	boolean checkLastModified = false;
	if (!create) {
	try {
	searcher = new IndexSearcher(dir, true);
	checkLastModified = true;
	} catch (IOException ioe) {
	log("IOException: " + ioe.getMessage());
	// Empty - ignore, which indicates to index all
	// documents
	}
	}

	log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE);

	IndexWriterConfig conf = new IndexWriterConfig(
	Version.LUCENE_CURRENT, analyzer).setOpenMode(
	create ? OpenMode.CREATE : OpenMode.APPEND);
	LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
	lmp.setUseCompoundFile(useCompoundIndex);
	lmp.setUseCompoundDocStore(useCompoundIndex);
	lmp.setMergeFactor(mergeFactor);
	IndexWriter writer = new IndexWriter(dir, conf);
	int totalFiles = 0;
	int totalIndexed = 0;
	int totalIgnored = 0;
	try {

	for (int i = 0; i < rcs.size(); i++) {
	ResourceCollection rc = rcs.elementAt(i);
	if (rc.isFilesystemOnly()) {
	Iterator resources = rc.iterator();
	while (resources.hasNext()) {
	Resource r = (Resource) resources.next();
	if (!r.isExists() \|\| !(r instanceof FileResource)) {
	continue;
	}

	totalFiles++;

	File file = ((FileResource) r).getFile();

	if (!file.exists() \|\| !file.canRead()) {
	throw new BuildException("File \"" +
	file.getAbsolutePath()
	+ "\" does not exist or is not readable.");
	}

	boolean indexIt = true;

	if (checkLastModified) {
	Term pathTerm =
	new Term("path", file.getPath());
	TermQuery query =
	new TermQuery(pathTerm);
	ScoreDoc[] hits = searcher.search(query, null, 1).scoreDocs;

	// if document is found, compare the
	// indexed last modified time with the
	// current file
	// - don't index if up to date
	if (hits.length > 0) {
	Document doc = searcher.doc(hits[0].doc);
	String indexModified =
	doc.get("modified").trim();
	if (indexModified != null) {
	long lastModified = 0;
	try {
	lastModified = DateTools.stringToTime(indexModified);
	} catch (ParseException e) {
	// if modified time is not parsable, skip
	}
	if (lastModified == file.lastModified()) {
	// TODO: remove existing document
	indexIt = false;
	}
	}
	}
	}

	if (indexIt) {
	try {
	log("Indexing " + file.getPath(),
	Project.MSG_VERBOSE);
	Document doc =
	handler.getDocument(file);

	if (doc == null) {
	totalIgnored++;
	} else {
	// Add the path of the file as a field named "path". Use a Keyword field, so
	// that the index stores the path, and so that the path is searchable
	doc.add(new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

	// Add the last modified date of the file a field named "modified". Use a
	// Keyword field, so that it's searchable, but so that no attempt is made
	// to tokenize the field into words.
	doc.add(new Field("modified", DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.NOT_ANALYZED));

	writer.addDocument(doc);
	totalIndexed++;
	}
	} catch (DocumentHandlerException e) {
	throw new BuildException(e);
	}
	}
	}
	// for j
	}
	// if (fs != null)
	}
	// for i

	writer.optimize();
	}
	//try
	finally {
	// always make sure everything gets closed,
	// no matter how we exit.
	writer.close();
	if (searcher != null) {
	searcher.close();
	}
	}

	Date end = new Date();

	log(totalIndexed + " out of " + totalFiles + " indexed (" +
	totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
	" milliseconds");
	} finally {
	dir.close();
	}
	}

	public static class HandlerConfig implements DynamicConfigurator {
	Properties props = new Properties();

	public void setDynamicAttribute(String attributeName, String value) throws BuildException {
	props.setProperty(attributeName, value);
	}

	public Object createDynamicElement(String elementName) throws BuildException {
	throw new BuildException("Sub elements not supported");
	}

	public Properties getProperties() {
	return props;
	}
	}

	public static class AnalyzerType extends EnumeratedAttribute {
	private static Map<String,String> analyzerLookup = new HashMap<String,String>();

	static {
	analyzerLookup.put("simple", SimpleAnalyzer.class.getName());
	analyzerLookup.put("standard", StandardAnalyzer.class.getName());
	analyzerLookup.put("stop", StopAnalyzer.class.getName());
	analyzerLookup.put("whitespace", WhitespaceAnalyzer.class.getName());
	}

	/**
	* @see EnumeratedAttribute#getValues
	*/
	@Override
	public String[] getValues() {
	Set<String> keys = analyzerLookup.keySet();
	return keys.toArray(new String[0]);
	}

	public String getClassname() {
	return analyzerLookup.get(getValue());
	}
	}
	}