blob: c28b820b327da7a01a12ebfc173402ccba596f7c [file] [log] [blame]
/*
* Copyright 1999-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cocoon.transformation;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.Stack;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.components.search.LuceneXMLIndexer;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
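* A Lucene index creation transformer: it indexes the content of each
* <code>lucene:document</code> element found inside a <code>lucene:index</code>
* element of its input.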
* <p>See <a href="http://wiki.cocoondev.org/Wiki.jsp?page=LuceneIndexTransformer">LuceneIndexTransformer</a>
* documentation on the Cocoon Wiki.</p>
* <p>TODO: Write more documentation.</p>
*
* @author <a href="mailto:vgritsenko@apache.org">Vadim Gritsenko</a>
* @author <a href="mailto:conal@nzetc.org">Conal Tuohy</a>
* @version CVS $Id: LuceneIndexTransformer.java,v 1.12 2004/03/05 13:01:59 bdelacretaz Exp $
*/
public class LuceneIndexTransformer extends AbstractTransformer
implements CacheableProcessingComponent, Configurable, Contextualizable {
// Names of the child elements read from the component configuration in configure()
// (*_CONFIG), names of the sitemap parameters read in setup() (*_PARAMETER), and the
// values used when neither supplies one (*_DEFAULT).
public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
public static final String DIRECTORY_CONFIG = "directory";
public static final String DIRECTORY_PARAMETER = "directory";
public static final String DIRECTORY_DEFAULT = "index";
public static final String MERGE_FACTOR_CONFIG = "merge-factor";
public static final String MERGE_FACTOR_PARAMETER = "merge-factor";
public static final int MERGE_FACTOR_DEFAULT = 20;
// Namespace of the elements and marker attributes this transformer reacts to.
public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
// Local name of the lucene:index element and the attributes read from it in startElement().
public static final String LUCENE_QUERY_ELEMENT = "index";
public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
// "yes" or "true" (case-insensitive) requests creation of a new index.
public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";
// Local name of the lucene:document element and its mandatory url attribute.
public static final String LUCENE_DOCUMENT_ELEMENT = "document";
public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
// Marker attributes in the LUCENE_URI namespace on elements inside a lucene:document.
// Only their presence is tested in endElement(); their value is not read.
// text-attr: the element's other attribute values are also appended to the indexed text.
public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
// store: the element's text is added with Field.Text instead of Field.UnStored.
public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
// Attribute added to each lucene:document element passed on down the pipeline.
public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
// SAX attribute type used for the elapsed-time attribute.
public static final String CDATA = "CDATA";
// The 3 states of the state machine
private static final int STATE_GROUND = 0; // initial or "ground" state
private static final int STATE_QUERY = 1; // processing a lucene:index (Query) element
private static final int STATE_DOCUMENT = 2; // processing a lucene:document element
// Initialization time variables
// Set in contextualize(); relative index directories are resolved against it.
protected File workDir = null;
// Declaration time parameters values (specified in sitemap component config)
private IndexerConfiguration configureConfiguration;
// Invocation time parameters values (specified in sitemap transform parameters)
private IndexerConfiguration setupConfiguration;
// Parameters specified in the input document
private IndexerConfiguration queryConfiguration;
// Runtime variables
// Current state: one of STATE_GROUND, STATE_QUERY, STATE_DOCUMENT.
private int processing;
// True when the index is (re)created rather than updated incrementally.
private boolean createIndex = false;
// Open index writer, or null when none is open.
private IndexWriter writer;
// Accumulated text of the lucene:document currently being processed.
private StringBuffer bodyText;
// Lucene document being built for the current lucene:document element.
private Document bodyDocument;
// Value of the url attribute of the current lucene:document element.
private String bodyDocumentURL;
// Stack of IndexHelperField objects, one per open element inside the current lucene:document.
private Stack elementStack = new Stack();
/**
* Storage for the document element's attributes until the document
* has been indexed, so that they can be copied to the output
* along with an <code>elapsed-time</code> attribute.
*/
private AttributesImpl documentAttributes;
// System.currentTimeMillis() taken when the current lucene:document element started.
private long documentStartTime;
/**
 * Derives the value of the UID field from a document URL by replacing
 * every slash with the NUL character (U+0000). Only the URL contributes
 * to the result; no modification time is appended.
 *
 * @param url the document URL; must not be <code>null</code>
 * @return the URL with each '/' replaced by U+0000
 */
private static String uid(String url) {
    final char slash = '/';
    final char nul = '\u0000';
    return url.replace(slash, nul);
}
/**
 * Reads the component-level defaults from the sitemap component
 * configuration. Each value falls back to its <code>*_DEFAULT</code>
 * constant when the corresponding child element is absent. The result
 * can later be overridden by sitemap parameters in
 * <code>setup</code> and by attributes of the <code>lucene:index</code>
 * element in the input document.
 *
 * @param conf the component configuration
 * @throws ConfigurationException declared by the Avalon contract
 */
public void configure(Configuration conf) throws ConfigurationException {
    final String analyzer =
        conf.getChild(ANALYZER_CLASSNAME_CONFIG).getValue(ANALYZER_CLASSNAME_DEFAULT);
    final String directory =
        conf.getChild(DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT);
    final int mergeFactor =
        conf.getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(MERGE_FACTOR_DEFAULT);

    this.configureConfiguration = new IndexerConfiguration(analyzer, directory, mergeFactor);
}
/**
 * Prepares the transformer for one pipeline invocation.
 * The sitemap parameters of the <code>&lt;map:transform&gt;</code>
 * element are all optional: a missing parameter takes the value stored
 * by <code>configure</code>. The values gathered here may in turn be
 * overridden by attributes of the <code>lucene:index</code> element in
 * the input document.
 *
 * @param resolver    not used by this implementation
 * @param objectModel not used by this implementation
 * @param src         not used by this implementation
 * @param parameters  the sitemap parameters
 */
public void setup(SourceResolver resolver, Map objectModel, String src, Parameters parameters)
throws ProcessingException, SAXException, IOException {
    final IndexerConfiguration defaults = configureConfiguration;

    final String analyzer =
        parameters.getParameter(ANALYZER_CLASSNAME_PARAMETER, defaults.analyzerClassname);
    final String directory =
        parameters.getParameter(DIRECTORY_PARAMETER, defaults.indexDirectory);
    final int mergeFactor =
        parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER, defaults.mergeFactor);

    setupConfiguration = new IndexerConfiguration(analyzer, directory, mergeFactor);
}
/**
 * Picks up the work directory from the component context. Relative
 * index directories are later resolved against it.
 *
 * @param context the component context
 * @throws ContextException if the context lookup fails
 */
public void contextualize(Context context) throws ContextException {
    final Object workDirEntry = context.get(Constants.CONTEXT_WORK_DIR);
    this.workDir = (File) workDirEntry;
}
/**
 * Returns the transformer to its ground state: closes a writer that is
 * still open, drops the per-document buffers and empties the element
 * stack before delegating to the superclass.
 */
public void recycle() {
    this.processing = STATE_GROUND;

    final IndexWriter stillOpen = this.writer;
    if (stillOpen != null) {
        try {
            stillOpen.close();
        } catch (IOException ioe) {
            // Deliberately ignored: the writer is being discarded anyway.
        }
        this.writer = null;
    }

    this.bodyText = null;
    this.bodyDocument = null;
    this.bodyDocumentURL = null;
    this.elementStack.clear();

    super.recycle();
}
/**
 * Returns the cache key of this component, which is the same constant
 * for every invocation.
 * <p>NOTE(review): together with the validity returned by
 * <code>getValidity</code> this presumably lets a caching pipeline reuse
 * earlier output; as this transformer writes to an index as a side
 * effect, confirm that skipping it on a cache hit is intended.</p>
 *
 * @return the constant key <code>"1"</code>
 */
public Serializable getKey() {
    final String constantKey = "1";
    return constantKey;
}
/**
 * Returns the validity object of this component, which is always the
 * shared <code>NOPValidity</code> instance.
 * <p>NOTE(review): presumably this marks cached output as always valid;
 * confirm that is acceptable for a transformer whose purpose is the
 * side effect of updating an index.</p>
 *
 * @return <code>NOPValidity.SHARED_INSTANCE</code>
 */
public SourceValidity getValidity() {
    final SourceValidity sharedValidity = NOPValidity.SHARED_INSTANCE;
    return sharedValidity;
}
/**
 * Passes the start-of-document event on unchanged; the transformer adds
 * no behaviour of its own here.
 *
 * @throws SAXException if the superclass call fails
 */
public void startDocument() throws SAXException {
    super.startDocument();
}
/**
 * Passes the end-of-document event on unchanged; the transformer adds
 * no behaviour of its own here.
 *
 * @throws SAXException if the superclass call fails
 */
public void endDocument() throws SAXException {
    super.endDocument();
}
/**
 * Forwards the start of a prefix-URI namespace mapping, but only while
 * the transformer is in its ground state. Mappings declared while a
 * <code>lucene:index</code> or <code>lucene:document</code> element is
 * being processed are dropped.
 *
 * @param prefix The Namespace prefix being declared.
 * @param uri The Namespace URI the prefix is mapped to.
 */
public void startPrefixMapping(String prefix, String uri) throws SAXException {
    if (processing != STATE_GROUND) {
        return;
    }
    super.startPrefixMapping(prefix, uri);
}
/**
 * Forwards the end of a prefix-URI namespace mapping, but only while
 * the transformer is in its ground state; in any other state the event
 * is dropped.
 *
 * @param prefix The prefix that was being mapped.
 */
public void endPrefixMapping(String prefix) throws SAXException {
    if (processing != STATE_GROUND) {
        return;
    }
    super.endPrefixMapping(prefix);
}
/**
 * Handles the start of an element according to the current state.
 * <ul>
 * <li>Ground state: a <code>lucene:index</code> element fixes the
 *     configuration for this indexing run (element attributes override
 *     the values gathered in <code>setup</code>), is forwarded down the
 *     pipeline and switches to the query state. Any other element is
 *     forwarded unchanged.</li>
 * <li>Query state: only <code>lucene:document</code> elements are
 *     allowed. The element is not forwarded yet; its attributes are kept
 *     until the matching end tag.</li>
 * <li>Document state: the element is recorded on the element stack so
 *     that its text and attributes can be indexed at its end tag.</li>
 * </ul>
 *
 * @throws SAXException if a <code>lucene:document</code> has no
 *         <code>url</code> attribute, if <code>lucene:index</code>
 *         contains anything but <code>lucene:document</code> elements,
 *         if <code>lucene:document</code> elements are nested, or if the
 *         <code>merge-factor</code> attribute is not an integer
 */
public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
throws SAXException {
    final boolean luceneNamespace = LUCENE_URI.equals(namespaceURI);

    if (processing == STATE_GROUND) {
        if (luceneNamespace && LUCENE_QUERY_ELEMENT.equals(localName)) {
            String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
            createIndex = sCreate != null &&
                (sCreate.equalsIgnoreCase("yes") || sCreate.equalsIgnoreCase("true"));

            String analyzerClassname = atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
            String indexDirectory = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
            String mergeFactor = atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);

            queryConfiguration = new IndexerConfiguration(
                analyzerClassname != null ? analyzerClassname : setupConfiguration.analyzerClassname,
                indexDirectory != null ? indexDirectory : setupConfiguration.indexDirectory,
                parseMergeFactor(mergeFactor, setupConfiguration.mergeFactor)
            );

            if (!createIndex) {
                // Not asked to create the index - but check if this is necessary anyway:
                try {
                    IndexReader reader = openReader();
                    reader.close();
                } catch (IOException ioe) {
                    // couldn't open the index - so recreate it
                    createIndex = true;
                }
            }

            // propagate the lucene:index to the next stage in the pipeline
            super.startElement(namespaceURI, localName, qName, atts);
            processing = STATE_QUERY;
        } else {
            super.startElement(namespaceURI, localName, qName, atts);
        }
    } else if (processing == STATE_QUERY) {
        // processing a lucene:index - expecting a lucene:document
        if (luceneNamespace && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
            this.bodyDocumentURL = atts.getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
            if (this.bodyDocumentURL == null) {
                throw new SAXException("<lucene:document> must have @url attribute");
            }

            // Remember the time the document indexing began
            this.documentStartTime = System.currentTimeMillis();
            // remember these attributes so they can be passed on to the next stage in the pipeline,
            // when this document element is ended.
            this.documentAttributes = new AttributesImpl(atts);
            this.bodyText = new StringBuffer();
            this.bodyDocument = new Document();
            this.elementStack.clear();
            processing = STATE_DOCUMENT;
        } else {
            throw new SAXException("<lucene:index> element can contain only <lucene:document> elements!");
        }
    } else if (processing == STATE_DOCUMENT) {
        // The end tag of a nested lucene:document would be taken for the end of the
        // enclosing document, which would be indexed half-finished before the stream
        // fails on the next event. Reject the nesting up front instead.
        if (luceneNamespace && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
            throw new SAXException("<lucene:document> elements must not be nested!");
        }
        elementStack.push(new IndexHelperField(localName, new AttributesImpl(atts)));
    }
}

/**
 * Converts the <code>merge-factor</code> attribute of a
 * <code>lucene:index</code> element to an int.
 *
 * @param value    the attribute value, or <code>null</code> if absent
 * @param fallback the value to use when the attribute is absent
 * @return the parsed value, or <code>fallback</code> when <code>value</code> is <code>null</code>
 * @throws SAXException if <code>value</code> is not a valid integer
 */
private static int parseMergeFactor(String value, int fallback) throws SAXException {
    if (value == null) {
        return fallback;
    }
    try {
        return Integer.parseInt(value);
    } catch (NumberFormatException nfe) {
        // Report bad input through the declared exception instead of an unchecked one.
        throw new SAXException("<lucene:index> @" + LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE
            + " must be an integer, but was '" + value + "'", nfe);
    }
}
/**
 * Handles the end of an element according to the current state.
 * <ul>
 * <li>Query state: the end of <code>lucene:index</code> optimizes and
 *     closes the index, forwards the end tag and returns to the ground
 *     state. Any other end tag is an error.</li>
 * <li>Document state: the end of <code>lucene:document</code> writes the
 *     collected document to the index and forwards the element (start
 *     and end) with an added <code>elapsed-time</code> attribute. The
 *     end of any other element turns that element's text and attributes
 *     into fields of the document.</li>
 * <li>Ground state: the end tag is forwarded unchanged.</li>
 * </ul>
 *
 * @throws SAXException if an unexpected end tag is seen in the query
 *         state, or wrapping an <code>IOException</code> raised while
 *         writing the index
 */
public void endElement(String namespaceURI, String localName, String qName)
throws SAXException {
    final boolean luceneNamespace = LUCENE_URI.equals(namespaceURI);

    if (processing == STATE_QUERY) {
        if (!(luceneNamespace && LUCENE_QUERY_ELEMENT.equals(localName))) {
            throw new SAXException("</lucene:index> was expected!");
        }
        // End query processing
        optimizeAndCloseIndex();
        // propagate the query element to the next stage in the pipeline
        super.endElement(namespaceURI, localName, qName);
        this.processing = STATE_GROUND;
    } else if (processing == STATE_DOCUMENT) {
        if (luceneNamespace && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
            // End document processing
            storeCollectedDocument();
            forwardDocumentElement(namespaceURI, localName, qName);
            this.processing = STATE_QUERY;
        } else {
            // End element processing
            addFieldsForElement(localName);
        }
    } else {
        // All other tags
        super.endElement(namespaceURI, localName, qName);
    }
}

/** Optimizes and closes the index writer, opening one first if none is open. */
private void optimizeAndCloseIndex() throws SAXException {
    try {
        if (this.writer == null) {
            openWriter();
        }
        this.writer.optimize();
        this.writer.close();
        this.writer = null;
    } catch (IOException e) {
        throw new SAXException(e);
    }
}

/** Adds the body, URL and UID fields to the current document and writes it to the index. */
private void storeCollectedDocument() throws SAXException {
    this.bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD, this.bodyText.toString()));
    this.bodyText = null;
    this.bodyDocument.add(Field.UnIndexed(LuceneXMLIndexer.URL_FIELD, this.bodyDocumentURL));
    // store: false, index: true, tokenize: false
    this.bodyDocument.add(new Field(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL), false, true, false));
    try {
        reindexDocument();
    } catch (IOException e) {
        throw new SAXException(e);
    }
    this.bodyDocumentURL = null;
}

/** Sends the remembered lucene:document element, plus its elapsed-time attribute, down the pipeline. */
private void forwardDocumentElement(String namespaceURI, String localName, String qName)
throws SAXException {
    long elapsedTime = System.currentTimeMillis() - this.documentStartTime;
    String elapsed = String.valueOf(elapsedTime);

    // If the incoming element already carried an elapsed-time attribute, overwrite it
    // rather than emitting the same attribute twice.
    int existing = this.documentAttributes.getIndex("", LUCENE_ELAPSED_TIME_ATTRIBUTE);
    if (existing >= 0) {
        this.documentAttributes.setValue(existing, elapsed);
    } else {
        this.documentAttributes.addAttribute(
            "",
            LUCENE_ELAPSED_TIME_ATTRIBUTE,
            LUCENE_ELAPSED_TIME_ATTRIBUTE,
            CDATA,
            elapsed
        );
    }
    super.startElement(namespaceURI, localName, qName, this.documentAttributes);
    super.endElement(namespaceURI, localName, qName);
}

/**
 * Pops the element that just ended from the element stack and adds its
 * attributes and text as fields of the current document.
 */
private void addFieldsForElement(String localName) {
    IndexHelperField tos = (IndexHelperField) elementStack.pop();
    StringBuffer text = tos.getText();
    Attributes atts = tos.getAttributes();

    boolean attributesToText = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
    for (int i = 0; i < atts.getLength(); i++) {
        // Ignore Lucene attributes
        if (LUCENE_URI.equals(atts.getURI(i))) {
            continue;
        }
        String attsLocalName = atts.getLocalName(i);
        String attsValue = atts.getValue(i);
        bodyDocument.add(Field.UnStored(localName + "@" + attsLocalName, attsValue));
        if (attributesToText) {
            // Keep the attribute value from fusing with the last word of the element text.
            int textLength = text.length();
            if (textLength > 0 && !Character.isWhitespace(text.charAt(textLength - 1))) {
                text.append(' ');
            }
            text.append(attsValue);
            text.append(' ');
            bodyText.append(attsValue);
            bodyText.append(' ');
        }
    }

    boolean store = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
    if (text.length() > 0) {
        if (store) {
            bodyDocument.add(Field.Text(localName, text.toString()));
        } else {
            bodyDocument.add(Field.UnStored(localName, text.toString()));
        }
    }
}
/**
 * Handles character data according to the current state.
 * Inside a <code>lucene:document</code> the text is appended to the
 * innermost open element and to the document body (followed by a
 * space) and is not forwarded. In the ground state the event is
 * forwarded unchanged. In the query state it is dropped.
 *
 * @param ch     the character buffer
 * @param start  offset of the first character to use
 * @param length number of characters to use
 */
public void characters(char[] ch, int start, int length)
throws SAXException {
    // length > 0 (not > 1): a chunk holding a single character is still content and
    // must be indexed, e.g. a one-letter value or a chunk split off by the parser.
    if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0 && length > 0 && elementStack.size() > 0) {
        String text = new String(ch, start, length);
        ((IndexHelperField) elementStack.peek()).append(text);
        bodyText.append(text);
        bodyText.append(' ');
    } else if (processing == STATE_GROUND) {
        super.characters(ch, start, length);
    }
}
/**
 * Opens an index writer on the configured index directory and stores
 * it in <code>writer</code>. A relative directory is resolved against
 * the work directory. If no index exists there yet,
 * <code>createIndex</code> is switched on so that one is created.
 *
 * @throws IOException if the directory or the writer cannot be opened
 */
private void openWriter() throws IOException {
    final String configuredPath = queryConfiguration.indexDirectory;
    File target = new File(configuredPath);
    if (!target.isAbsolute()) {
        target = new File(workDir, configuredPath);
    }

    // If the index directory doesn't exist, then always create it.
    if (!IndexReader.indexExists(target)) {
        createIndex = true;
    }

    // Get the index directory, creating it if necessary
    final Directory luceneDirectory = LuceneCocoonHelper.getDirectory(target, createIndex);
    final Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(queryConfiguration.analyzerClassname);

    this.writer = new IndexWriter(luceneDirectory, analyzer, createIndex);
    this.writer.mergeFactor = queryConfiguration.mergeFactor;
}
/**
 * Opens an index reader on the configured index directory. A relative
 * directory is resolved against the work directory.
 *
 * @return the opened reader; the caller is responsible for closing it
 * @throws IOException if the directory or the index cannot be opened
 */
private IndexReader openReader() throws IOException {
    final String configuredPath = queryConfiguration.indexDirectory;
    File target = new File(configuredPath);
    if (!target.isAbsolute()) {
        target = new File(workDir, configuredPath);
    }

    final Directory luceneDirectory = LuceneCocoonHelper.getDirectory(target, createIndex);
    return IndexReader.open(luceneDirectory);
}
/**
 * Writes the current <code>bodyDocument</code> to the index and then
 * clears it.
 * <p>When the index is being created, the document is simply added
 * through a writer that stays open for the whole run. Otherwise any
 * existing entry with the same UID is deleted first (best effort), and
 * the document is added through a writer that is opened and closed for
 * this one document.</p>
 *
 * @throws IOException if the writer cannot be opened or the document
 *         cannot be added
 */
private void reindexDocument() throws IOException {
    if (this.createIndex) {
        // The index is being created, so there's no need to delete the doc from an existing index.
        // This means we can keep a single IndexWriter open throughout the process.
        if (this.writer == null) {
            openWriter();
        }
        this.writer.addDocument(this.bodyDocument);
    } else {
        // This is an incremental reindex, so the document should be removed from the index before adding it
        IndexReader reader = null;
        try {
            reader = openReader();
            reader.delete(new Term(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL)));
        } catch (IOException e) {
            // Best effort: a failed delete must not prevent the document from being added.
        } finally {
            // Always release the reader, also when delete() failed.
            // NOTE(review): presumably an unclosed reader would keep holding the index
            // lock it took for the delete and block the writer below - confirm
            // against the Lucene version in use.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Same best-effort policy as for the delete itself.
                }
            }
        }
        openWriter();
        this.writer.addDocument(this.bodyDocument);
        this.writer.close();
        this.writer = null;
    }
    this.bodyDocument = null;
}
/**
 * Holds what is known about one open element inside a
 * <code>lucene:document</code>: its local name, its attributes and the
 * character data collected for it so far.
 */
class IndexHelperField {

    /** Local name of the element. */
    String localName;
    /** Character data collected for the element; never <code>null</code>. */
    StringBuffer text;
    /** Attributes the element was started with. */
    Attributes attributes;

    /**
     * Creates a holder with an empty text buffer.
     *
     * @param localName local name of the element
     * @param atts      attributes of the element
     */
    IndexHelperField(String localName, Attributes atts) {
        this.text = new StringBuffer();
        this.attributes = atts;
        this.localName = localName;
    }

    /** @return the attributes passed to the constructor */
    public Attributes getAttributes() {
        return this.attributes;
    }

    /** @return the live text buffer (not a copy) */
    public StringBuffer getText() {
        return this.text;
    }

    /** Appends a string to the collected text. */
    public void append(String text) {
        final StringBuffer buffer = this.text;
        buffer.append(text);
    }

    /** Appends <code>length</code> characters of <code>str</code>, starting at <code>offset</code>. */
    public void append(char[] str, int offset, int length) {
        final StringBuffer buffer = this.text;
        buffer.append(str, offset, length);
    }
}
/**
 * Plain holder for the three indexing settings: the analyzer class
 * name, the index directory and the merge factor.
 */
class IndexerConfiguration {

    /** Class name of the analyzer to use. */
    String analyzerClassname;
    /** Index directory, as configured (may be relative). */
    String indexDirectory;
    /** Merge factor handed to the index writer. */
    int mergeFactor;

    /**
     * Stores the three settings as given; no validation is performed.
     *
     * @param analyzerClassname class name of the analyzer
     * @param indexDirectory    index directory
     * @param mergeFactor       merge factor
     */
    public IndexerConfiguration(String analyzerClassname, String indexDirectory, int mergeFactor) {
        this.mergeFactor = mergeFactor;
        this.indexDirectory = indexDirectory;
        this.analyzerClassname = analyzerClassname;
    }
}
}