import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.Document
import org.apache.lucene.document.Field
import org.apache.lucene.document.StringField
import org.apache.lucene.document.TextField
import org.apache.lucene.index.IndexWriter
import org.apache.lucene.index.IndexWriterConfig
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.util.Version

import static org.apache.lucene.document.Field.Store.*
/**
 * Indexer: traverses a file system and indexes .txt files
 *
 * @author Jeremy Rayner
 * based on examples in the wonderful 'Lucene in Action' book
 * by Erik Hatcher and Otis Gospodnetic
 *
 * June 25th, 2013: Updated for Lucene 4.3.1
 * requires a lucene-4.3.x.jar on the classpath
 */
// Script entry point: validate the command line, build the index, report the timing.
// Expects exactly two arguments: <index dir> (where the Lucene index is written)
// and <data dir> (root of the tree that is scanned for .txt files).
if (args.size() != 2) {
    throw new Exception("Usage: groovy -cp lucene-4.3.x.jar Indexer <index dir> <data dir>")
}

// IndexWriter is constructed from a Lucene Directory, so the index path is wrapped here.
def indexDir = FSDirectory.open(new File(args[0])) // Create Lucene index in this directory
def dataDir = new File(args[1])                    // Index files in this directory

def start = System.currentTimeMillis()
def numIndexed = index(indexDir, dataDir)
def end = System.currentTimeMillis()

println "Indexing $numIndexed files took ${end - start} milliseconds"
/**
 * Indexes the .txt files found beneath dataDir into the Lucene index at indexDir.
 *
 * @param indexDir Lucene Directory handed to the IndexWriter
 * @param dataDir  root java.io.File of the tree to scan
 * @return the document count reported by the writer after the scan
 * @throws IOException if dataDir does not exist or is not a directory
 */
def index(indexDir, dataDir) {
    if (!dataDir.exists() || !dataDir.isDirectory()) {
        throw new IOException("$dataDir does not exist or is not a directory")
    }

    def config = new IndexWriterConfig(Version.LUCENE_43, new StandardAnalyzer(Version.LUCENE_43))
    def writer = new IndexWriter(indexDir, config) // Create Lucene index
    try {
        dataDir.eachFileRecurse {
            // endsWith avoids the regex pitfall where an unescaped '.' matches any character
            if (it.name.endsWith('.txt')) { // Index .txt files only
                indexFile(writer, it)
            }
        }
        // NOTE(review): numDocs() counts every document in the index, so a pre-existing
        // index at indexDir would inflate this figure — confirm the intended open mode.
        return writer.numDocs()
    } finally {
        writer.close() // Close index, even when indexing a file fails
    }
}
/**
 * Adds a single file to the index as one document with two fields:
 * "contents" (read from the file) and "filename" (the canonical path).
 * Hidden, missing or unreadable files, and directories, are skipped silently.
 *
 * @param writer the IndexWriter that receives the document
 * @param f      the java.io.File to index
 */
void indexFile(writer, f) {
    if (f.hidden || !f.exists() || !f.canRead() || f.directory) { return }

    println "Indexing $f.canonicalPath"

    // withReader closes the reader once the closure exits, including when addDocument throws.
    f.withReader { reader ->
        def doc = new Document()

        // Construct a Field that is tokenized and indexed, but is not stored in the index verbatim.
        doc.add(new TextField("contents", reader))

        // Construct a Field that is not tokenized, but is indexed and stored.
        doc.add(new StringField("filename", f.canonicalPath, Field.Store.YES))

        writer.addDocument(doc) // Add document to Lucene index
    }
}