groovy-core/src/examples/searchEngine/Indexer.groovy - groovy - Git at Google

 import org.apache.lucene.analysis.standard.StandardAnalyzer
 import org.apache.lucene.document.Document
 import org.apache.lucene.document.Field
 import org.apache.lucene.index.IndexWriter

 /**
  * Indexer: traverses a file system and indexes .txt files
  *
  * @author Jeremy Rayner <groovy@ross-rayner.com>
  * based on examples in the wonderful 'Lucene in Action' book
  * by Erik Hatcher and Otis Gospodnetic ( http://www.lucenebook.com )
  *
  * requires a lucene-1.x.x.jar from http://lucene.apache.org
  */

 // Exactly two command-line arguments are expected: the index location and the data root.
 if (args.length != 2) {
     throw new Exception("Usage: groovy -cp lucene-1.4.3.jar Indexer <index dir> <data dir>")
 }
 File indexLocation = new File(args[0]) // the Lucene index is created in this directory
 File dataRoot = new File(args[1])      // files under this directory are indexed

 // Time the complete indexing run.
 long startedAt = System.currentTimeMillis()
 def fileCount = index(indexLocation, dataRoot)
 long elapsedMillis = System.currentTimeMillis() - startedAt

 println "Indexing $fileCount files took ${elapsedMillis} milliseconds"

 /**
  * Builds a Lucene index in indexDir from the .txt files found under dataDir.
  *
  * @param indexDir directory handed to the IndexWriter as the index location
  * @param dataDir  directory that is walked recursively for files to index
  * @return the writer's document count, read after the walk and before optimizing
  * @throws IOException if dataDir does not exist or is not a directory
  */
 def index(indexDir, dataDir) {
     if (!dataDir.exists() || !dataDir.directory) {
         throw new IOException("$dataDir does not exist or is not a directory")
     }
     def writer = new IndexWriter(indexDir, new StandardAnalyzer(), true) // Create Lucene index
     try {
         writer.useCompoundFile = false

         dataDir.eachFileRecurse {
             // Literal suffix test. A regex such as /.txt$/ leaves the dot unescaped and
             // would also accept names that merely end in "txt" (e.g. "notes_txt").
             if (it.name.endsWith('.txt')) { // Index .txt files only
                 indexFile(writer, it)
             }
         }
         def numIndexed = writer.docCount()
         writer.optimize()
         return numIndexed
     } finally {
         writer.close() // Close index, even when indexing failed part-way through
     }
 }

 /**
  * Adds a single file to the index as a document with a "contents" field
  * (read from the file) and a "filename" field (the canonical path).
  * Hidden, missing or unreadable files and directories are skipped silently.
  *
  * @param writer the open index writer that receives the document
  * @param f      the file to index
  */
 void indexFile(writer, f) {
     if (f.hidden || !f.exists() || !f.canRead() || f.directory) { return }

     println "Indexing $f.canonicalPath"
     def doc = new Document()
     def reader = new FileReader(f)
     try {
         // Construct a Field that is tokenized and indexed, but is not stored in the index verbatim.
         doc.add(Field.Text("contents", reader))

         // Construct a Field that is not tokenized, but is indexed and stored.
         doc.add(Field.Keyword("filename", f.canonicalPath))

         writer.addDocument(doc) // Add document to Lucene index
     } finally {
         // Release the file handle explicitly so it is not leaked when building or
         // adding the document throws; closing an already-closed reader is harmless.
         reader.close()
     }
 }
	import org.apache.lucene.analysis.standard.StandardAnalyzer
	import org.apache.lucene.document.Document
	import org.apache.lucene.document.Field
	import org.apache.lucene.index.IndexWriter

	/**
	* Indexer: traverses a file system and indexes .txt files
	*
	* @author Jeremy Rayner <groovy@ross-rayner.com>
	* based on examples in the wonderful 'Lucene in Action' book
	* by Erik Hatcher and Otis Gospodnetic ( http://www.lucenebook.com )
	*
	* requires a lucene-1.x.x.jar from http://lucene.apache.org
	*/

	// Exactly two command-line arguments are expected: the index location and the data root.
	if (args.length != 2) {
		throw new Exception("Usage: groovy -cp lucene-1.4.3.jar Indexer <index dir> <data dir>")
	}
	File indexLocation = new File(args[0]) // the Lucene index is created in this directory
	File dataRoot = new File(args[1]) // files under this directory are indexed

	// Time the complete indexing run.
	long startedAt = System.currentTimeMillis()
	def fileCount = index(indexLocation, dataRoot)
	long elapsedMillis = System.currentTimeMillis() - startedAt

	println "Indexing $fileCount files took ${elapsedMillis} milliseconds"

	/**
	 * Builds a Lucene index in indexDir from the .txt files found under dataDir.
	 *
	 * @param indexDir directory handed to the IndexWriter as the index location
	 * @param dataDir  directory that is walked recursively for files to index
	 * @return the writer's document count, read after the walk and before optimizing
	 * @throws IOException if dataDir does not exist or is not a directory
	 */
	def index(indexDir, dataDir) {
		if (!dataDir.exists() || !dataDir.directory) {
			throw new IOException("$dataDir does not exist or is not a directory")
		}
		def writer = new IndexWriter(indexDir, new StandardAnalyzer(), true) // Create Lucene index
		try {
			writer.useCompoundFile = false

			dataDir.eachFileRecurse {
				// Literal suffix test. A regex such as /.txt$/ leaves the dot unescaped and
				// would also accept names that merely end in "txt" (e.g. "notes_txt").
				if (it.name.endsWith('.txt')) { // Index .txt files only
					indexFile(writer, it)
				}
			}
			def numIndexed = writer.docCount()
			writer.optimize()
			return numIndexed
		} finally {
			writer.close() // Close index, even when indexing failed part-way through
		}
	}

	/**
	 * Adds a single file to the index as a document with a "contents" field
	 * (read from the file) and a "filename" field (the canonical path).
	 * Hidden, missing or unreadable files and directories are skipped silently.
	 *
	 * @param writer the open index writer that receives the document
	 * @param f      the file to index
	 */
	void indexFile(writer, f) {
		if (f.hidden || !f.exists() || !f.canRead() || f.directory) { return }

		println "Indexing $f.canonicalPath"
		def doc = new Document()
		def reader = new FileReader(f)
		try {
			// Construct a Field that is tokenized and indexed, but is not stored in the index verbatim.
			doc.add(Field.Text("contents", reader))

			// Construct a Field that is not tokenized, but is indexed and stored.
			doc.add(Field.Keyword("filename", f.canonicalPath))

			writer.addDocument(doc) // Add document to Lucene index
		} finally {
			// Release the file handle explicitly so it is not leaked when building or
			// adding the document throws; closing an already-closed reader is harmless.
			reader.close()
		}
	}