blob: 2709e6a029412366b0986f1df1732715e0cb466c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.Document
import org.apache.lucene.document.Field
import org.apache.lucene.index.IndexWriter
import org.apache.lucene.util.Version
import org.apache.lucene.index.IndexWriterConfig
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.document.TextField
import org.apache.lucene.document.StringField
import static org.apache.lucene.document.Field.Store.*
/**
* Indexer: traverses a file system and indexes .txt files
*
* @author Jeremy Rayner <groovy@ross-rayner.com>
* based on examples in the wonderful 'Lucene in Action' book
* by Erik Hatcher and Otis Gospodnetic (https://www.manning.com/books/lucene-in-action-second-edition)
*
* June 25th, 2013: Updated for Lucene 4.3.1
* requires a lucene-4.3.x.jar from http://lucene.apache.org
*/
if (args.size() != 2 ) {
throw new Exception("Usage: groovy -cp lucene-1.4.3.jar Indexer <index dir> <data dir>")
}
def indexDir = FSDirectory.open(new File(args[0])) // Create Lucene index in this directory
def dataDir = new File(args[1]) // Index files in this directory
def start = new Date().time
def numIndexed = index(indexDir, dataDir)
def end = new Date().time
println "Indexing $numIndexed files took ${end - start} milliseconds"
def index(indexDir, dataDir) {
if (!dataDir.exists() || !dataDir.directory) {
throw new IOException("$dataDir does not exist or is not a directory")
}
def config = new IndexWriterConfig(Version.LUCENE_43, new StandardAnalyzer(Version.LUCENE_43))
def writer = new IndexWriter(indexDir, config) // Create Lucene index
dataDir.eachFileRecurse {
if (it.name =~ /.txt$/) { // Index .txt files only
indexFile(writer,it)
}
}
def numIndexed = writer.numDocs()
writer.close() // Close index
return numIndexed
}
void indexFile(writer, f) {
if (f.hidden || !f.exists() || !f.canRead() || f.directory) { return }
println "Indexing $f.canonicalPath"
def doc = new Document()
// Construct a Field that is tokenized and indexed, but is not stored in the index verbatim.
doc.add(new TextField("contents", f.newReader()))
// Construct a Field that is not tokenized, but is indexed and stored.
doc.add(new StringField("filename",f.canonicalPath, YES))
writer.addDocument(doc) // Add document to Lucene index
}