blob: 7efd3cb898a89cf21666e63c4143c314c59714cd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cocoon.components.search;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.components.crawler.CocoonCrawler;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.Directory;
import java.io.IOException;
import java.net.URL;
import java.util.Iterator;
/**
* A lucene indexer.
*
* <p>
* XML documents are indexed using lucene.
* Links to XML documents are supplied by
* a crawler, requesting links of documents by specifying a cocoon-view, and
* HTTP protocol.
* </p>
*
* @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
* @version CVS $Id$
*/
public class SimpleLuceneCocoonIndexerImpl extends AbstractLogEnabled
implements LuceneCocoonIndexer, Configurable, Serviceable, Disposable
{
/**
* configuration tagname for specifying the analyzer class
*/
public final static String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
/**
* configuration default analyzer class
*/
public final static String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
/**
* configuration tagname for specifying lucene's index directory
*/
public final static String DIRECTORY_CONFIG = "directory";
/**
* configuration default directory, ie. no default.
*/
public final static String DIRECTORY_DEFAULT = null;
/**
* configuration tagname for specifying lucene's merge factor.
*/
public final static String MERGE_FACTOR_CONFIG = "merge-factor";
/**
* configuration default value for
* <a href="http://www.mail-archive.com/lucene-user@jakarta.apache.org/msg00373.html">lucene's merge factor</a>.
*/
public final static int MERGE_FACTOR_DEFAULT = 10;
/**
* The service manager for looking up components used.
*/
protected ServiceManager manager = null;
/** The used lucene analyzer */
protected Analyzer analyzer;
// private String analyzerClassnameDefault = ANALYZER_CLASSNAME_DEFAULT;
/** The Lucene Merge Factor */
private int mergeFactor = MERGE_FACTOR_DEFAULT;
/**
* Sets the analyzer attribute of the SimpleLuceneCocoonIndexerImpl object
*
* @param analyzer
* The new analyzer value
*/
public void setAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}
/**
* Configure this component.
*
* @param conf is the configuration
* @exception ConfigurationException is thrown if configuring fails
*/
public void configure(Configuration conf) throws ConfigurationException {
Configuration child;
/* child = conf.getChild(ANALYZER_CLASSNAME_CONFIG, false);
if (child != null) {
// fix Bugzilla Bug 25277, use child.getValue
// and in all following blocks
String value = child.getValue(ANALYZER_CLASSNAME_DEFAULT);
if (value != null) {
analyzerClassnameDefault = value;
}
}
*/
child = conf.getChild(MERGE_FACTOR_CONFIG, false);
if (child != null) {
// fix Bugzilla Bug 25277, use child instead of conf
int int_value = child.getValueAsInteger(MERGE_FACTOR_DEFAULT);
mergeFactor = int_value;
}
}
/**
* Set the current <code>ServiceManager</code> instance used by this
* <code>Serviceable</code>.
*
* @param manager used by this component
* @exception ServiceException is never thrown
*/
public void service(ServiceManager manager) throws ServiceException {
this.manager = manager;
}
/**
* Dispose this component.
*/
public void dispose() { }
/**
* index content of base_url, index content of links from base_url.
*
* @param index
* the lucene store to write the index to
* @param create
* if true create, or overwrite existing index, else update
* existing index.
* @param base_url
* index content of base_url, and crawl through all its links
* recursivly.
* @exception ProcessingException
* is thrown if indexing fails
*/
public void index(Directory index, boolean create, URL base_url) throws ProcessingException {
IndexWriter writer = null;
LuceneXMLIndexer lxi = null;
CocoonCrawler cocoonCrawler = null;
try {
lxi = (LuceneXMLIndexer) manager.lookup(LuceneXMLIndexer.ROLE);
writer = new IndexWriter(index, analyzer, create);
writer.mergeFactor = this.mergeFactor;
cocoonCrawler = (CocoonCrawler) manager.lookup(CocoonCrawler.ROLE);
cocoonCrawler.crawl(base_url);
Iterator cocoonCrawlerIterator = cocoonCrawler.iterator();
while (cocoonCrawlerIterator.hasNext()) {
URL crawl_url = (URL) cocoonCrawlerIterator.next();
// result of fix Bugzilla Bug 25270, in SimpleCocoonCrawlerImpl
// check if crawl_url is null
if (crawl_url == null) {
continue;
} else if (!crawl_url.getHost().equals(base_url.getHost()) ||
crawl_url.getPort() != base_url.getPort()) {
// skip urls using different host, or port than host,
// or port of base url
if (getLogger().isDebugEnabled()) {
getLogger().debug("Skipping crawling URL " + crawl_url.toString() +
" as base_url is " + base_url.toString());
}
continue;
}
// build lucene documents from the content of the crawl_url
Iterator i = lxi.build(crawl_url).iterator();
// add all built lucene documents
while (i.hasNext()) {
writer.addDocument((Document) i.next());
}
}
// optimize it
writer.optimize();
} catch (IOException ioe) {
throw new ProcessingException("IOException in index()", ioe);
} catch (ServiceException se) {
throw new ProcessingException("Could not lookup service in index()", se);
} finally {
if (writer != null) {
try {
writer.close();
} catch (IOException ioe) {
}
writer = null;
}
if (lxi != null) {
manager.release(lxi);
lxi = null;
}
if (cocoonCrawler != null) {
manager.release(cocoonCrawler);
cocoonCrawler = null;
}
}
}
/**
* A document iterator deleting "old" documents form the index.
*
* TODO: use this class before indexing, in non-creating mode.
*/
static class DocumentDeletableIterator {
private IndexReader reader;
// existing index
private TermEnum uidIter;
// document id iterator
/**
* Constructor for the DocumentDeletableIterator object
*
* @param directory
* Description of Parameter
* @exception IOException
* Description of Exception
*/
public DocumentDeletableIterator(Directory directory) throws IOException {
reader = IndexReader.open(directory);
// open existing index
uidIter = reader.terms(new Term("uid", ""));
// init uid iterator
}
/**
* Description of the Method
*
* @exception IOException
* Description of Exception
*/
public void deleteAllStaleDocuments() throws IOException {
while (uidIter.term() != null && uidIter.term().field().equals("uid")) {
reader.delete(uidIter.term());
uidIter.next();
}
}
/**
* Description of the Method
*
* @param uid
* Description of Parameter
* @exception IOException
* Description of Exception
*/
public void deleteModifiedDocuments(String uid) throws IOException {
while (documentHasBeenModified(uidIter.term(), uid)) {
reader.delete(uidIter.term());
uidIter.next();
}
if (documentHasNotBeenModified(uidIter.term(), uid)) {
uidIter.next();
}
}
/**
* Description of the Method
*
* @exception Throwable
* Description of Exception
*/
protected void finalize() throws Throwable {
super.finalize();
if (uidIter != null) {
uidIter.close();
// close uid iterator
uidIter = null;
}
if (reader != null) {
reader.close();
// close existing index
reader = null;
}
}
/**
* Description of the Method
*
* @param term
* Description of Parameter
* @return Description of the Returned Value
*/
boolean documentIsDeletable(Term term) {
return term != null && term.field().equals("uid");
}
/**
* Description of the Method
*
* @param term
* Description of Parameter
* @param uid
* Description of Parameter
* @return Description of the Returned Value
*/
boolean documentHasBeenModified(Term term, String uid) {
return documentIsDeletable(term) && term.text().compareTo(uid) < 0;
}
/**
* Description of the Method
*
* @param term
* Description of Parameter
* @param uid
* Description of Parameter
* @return Description of the Returned Value
*/
boolean documentHasNotBeenModified(Term term, String uid) {
return documentIsDeletable(term) && term.text().compareTo(uid) == 0;
}
}
}