blob: 7c0edee436d245ad19493166f219bd5f1d8a681f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cocoon.transformation;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.Stack;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.IndexException;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.components.search.LuceneXMLIndexer;
import org.apache.cocoon.components.search.components.Indexer;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.commons.lang.BooleanUtils;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
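/**
* A lucene index creation transformer.
* <p>
* See <a
* href="http://wiki.cocoondev.org/Wiki.jsp?page=LuceneIndexTransformer">LuceneIndexTransformer
* </a> documentation on the Cocoon Wiki.
* </p>
* <p>
* The transformer reacts to <code>index</code> elements in the
* <code>http://apache.org/cocoon/lucene/1.0</code> namespace. Such an element
* may only contain <code>document</code> elements of the same namespace, and
* each of them must carry a <code>url</code> attribute. The content of every
* <code>document</code> element is collected into one Lucene document and
* handed to the indexer; the <code>document</code> element itself is then
* passed on, without its content, with an additional
* <code>elapsed-time</code> attribute. Everything outside an
* <code>index</code> element is passed through unchanged.
* </p>
* <p>
* TODO: Document the analyzer, directory and merge-factor settings in more
* detail.
* </p>
*
* @author <a href="mailto:vgritsenko@apache.org">Vadim Gritsenko </a>
* @author <a href="mailto:conal@nzetc.org">Conal Tuohy </a>
* @author Nicolas Maisonneuve
*/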
public class LuceneIndexTransformerOptimized extends AbstractTransformer implements
CacheableProcessingComponent, Configurable, Contextualizable,
Serviceable {
// Setting names and defaults. The *_CONFIG names are read from the component
// configuration in configure(); the *_PARAMETER names are read from the
// sitemap parameters in setup(). The *_DEFAULT values apply when the
// component configuration does not supply a value.
public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
public static final String DIRECTORY_CONFIG = "directory";
public static final String DIRECTORY_PARAMETER = "directory";
public static final String DIRECTORY_DEFAULT = "index";
public static final String MERGE_FACTOR_CONFIG = "merge-factor";
public static final String MERGE_FACTOR_PARAMETER = "merge-factor";
public static final int MERGE_FACTOR_DEFAULT = 20;
// Namespace of the elements and attributes this transformer reacts to.
public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
// Local name of the lucene:index element (called the "query" element in this
// class) and the names of the attributes read from it.
public static final String LUCENE_QUERY_ELEMENT = "index";
public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";
// Local name of the lucene:document element and of its mandatory url attribute.
public static final String LUCENE_DOCUMENT_ELEMENT = "document";
public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
// Attribute in the lucene namespace: when present on an element inside a
// document, the element's other attribute values are appended to its text.
public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
// Attribute in the lucene namespace: when present on an element inside a
// document, the element text is added with Field.Text instead of
// Field.UnStored.
public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
// Attribute added to the outgoing lucene:document element; its value is the
// number of milliseconds between the start and the end of that element.
public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
// SAX attribute type used for the elapsed-time attribute.
public static final String CDATA = "CDATA";
// The 3 states of the state machine
private static final int STATE_GROUND = 0; // initial or "ground" state
private static final int STATE_QUERY = 1; // processing a lucene:index
// (Query) element
private static final int STATE_DOCUMENT = 2; // processing a
// lucene:document element
// Initialization time variables
// Work directory taken from the component context; relative index
// directories are resolved against it.
protected File workDir = null;
// service manager
private ServiceManager manager;
// Indexer component; looked up lazily when the first document is indexed and
// released again in recycle().
private Indexer indexer;
// Declaration time parameters values (specified in sitemap component
// config)
private IndexerConfiguration configureConfiguration;
// Invocation time parameters values (specified in sitemap transform
// parameters)
private IndexerConfiguration setupConfiguration;
// Parameters specified in the input document
private IndexerConfiguration queryConfiguration;
// Runtime variables
// Current state of the state machine (one of the STATE_* constants).
private int processing;
// Whether the index is to be (re)created; set from the create attribute of
// lucene:index and forced to true when no index exists yet.
private boolean createIndex = false;
// Text of the whole lucene:document currently being processed.
private StringBuffer bodyText;
// Lucene document being built for the current lucene:document element.
private Document bodyDocument;
// Value of the url attribute of the current lucene:document element.
private String bodyDocumentURL;
// Stack of IndexHelperField objects, one per element currently open inside
// the lucene:document element.
private Stack elementStack = new Stack();
/**
* Storage for the document element's attributes until the document has been
* indexed, so that they can be copied to the output along with the
* <code>elapsed-time</code> attribute.
* NOTE(review): earlier documentation mentioned a boolean
* <code>indexed</code> attribute; this class does not add one.
*/
private AttributesImpl documentAttributes;
// System.currentTimeMillis() value taken when the current lucene:document
// element started.
private long documentStartTime;
/**
 * Derives the value stored in the UID field from a document URL by replacing
 * every slash with the NUL character (U+0000).
 *
 * @param url
 *            the document URL; must not be <code>null</code>
 * @return the URL with each '/' replaced by the NUL character
 */
private static String uid(String url) {
    final char slash = '/';
    final char nul = '\u0000';
    return url.replace(slash, nul);
}
/**
* Stores the service manager. It is used later to look up the
* {@link Indexer} component and to release it again.
*
* @param manager
* The service manager to use.
*/
public void service(ServiceManager manager) throws ServiceException {
this.manager = manager;
}
/**
 * Reads the component-level defaults. The values found here (analyzer class
 * name, index directory and merge factor) are the fallbacks used when
 * neither the sitemap parameters nor the attributes of the lucene:index
 * element supply a value. Missing configuration children fall back to the
 * <code>*_DEFAULT</code> constants.
 *
 * @param conf
 *            the component configuration
 */
public void configure(Configuration conf) throws ConfigurationException {
    final String analyzerClassname = conf.getChild(ANALYZER_CLASSNAME_CONFIG)
            .getValue(ANALYZER_CLASSNAME_DEFAULT);
    final String indexDirectory = conf.getChild(DIRECTORY_CONFIG)
            .getValue(DIRECTORY_DEFAULT);
    final int mergeFactor = conf.getChild(MERGE_FACTOR_CONFIG)
            .getValueAsInteger(MERGE_FACTOR_DEFAULT);

    this.configureConfiguration = new IndexerConfiguration(
            analyzerClassname, indexDirectory, mergeFactor);
}
/**
 * Reads the per-invocation settings from the sitemap parameters. Every
 * parameter is optional; a parameter that is absent takes the value stored
 * by {@link #configure(Configuration)}. The resulting values may in turn be
 * overridden by attributes of the lucene:index element in the input
 * document.
 *
 * @param resolver
 *            not used by this transformer
 * @param objectModel
 *            not used by this transformer
 * @param src
 *            not used by this transformer
 * @param parameters
 *            the parameters given to <code>&lt;map:transform&gt;</code>
 */
public void setup(SourceResolver resolver, Map objectModel, String src,
        Parameters parameters) throws ProcessingException, SAXException,
        IOException {
    final IndexerConfiguration defaults = this.configureConfiguration;

    final String analyzerClassname = parameters.getParameter(
            ANALYZER_CLASSNAME_PARAMETER, defaults.analyzerClassname);
    final String indexDirectory = parameters.getParameter(
            DIRECTORY_PARAMETER, defaults.indexDirectory);
    final int mergeFactor = parameters.getParameterAsInteger(
            MERGE_FACTOR_PARAMETER, defaults.mergeFactor);

    this.setupConfiguration = new IndexerConfiguration(analyzerClassname,
            indexDirectory, mergeFactor);
}
/**
 * Picks up the work directory from the component context. Relative index
 * directories are later resolved against it.
 *
 * @param context
 *            the component context
 */
public void contextualize(Context context) throws ContextException {
    final Object workDirectory = context.get(Constants.CONTEXT_WORK_DIR);
    this.workDir = (File) workDirectory;
}
/**
 * Returns this transformer to its initial state so that the instance can be
 * reused for another pipeline run.
 * <p>
 * The indexer, if one was looked up, is released. All state that belongs to
 * the run just finished is dropped: the per-document buffers, the saved
 * attributes of the last lucene:document element, the settings read from
 * the last lucene:index element and the create flag. The settings stored by
 * <code>setup</code> are left alone; they are replaced on the next call to
 * <code>setup</code>.
 * </p>
 */
public void recycle() {
    this.processing = STATE_GROUND;

    if (this.indexer != null) {
        this.manager.release(this.indexer);
        this.indexer = null;
    }

    // Per-document state
    this.bodyText = null;
    this.bodyDocument = null;
    this.bodyDocumentURL = null;
    this.documentAttributes = null;
    this.documentStartTime = 0L;
    this.elementStack.clear();

    // Per-lucene:index state; both values are set again whenever the next
    // lucene:index element starts, so nothing from this run can leak into
    // the next one.
    this.queryConfiguration = null;
    this.createIndex = false;

    super.recycle();
}
/**
* Generate the unique key. This key must be unique inside the space of this
* component.
* <p>
* This implementation always returns the constant string "1".
* </p>
* NOTE(review): together with the always-valid validity object this
* presumably lets a cached pipeline result bypass the transformer, and with
* it the indexing - confirm that this is intended.
*
* @return The generated key
*/
public Serializable getKey() {
return "1";
}
/**
* Generate the validity object.
* <p>
* This implementation always returns the shared {@link NOPValidity}
* instance; it never returns <code>null</code>.
* </p>
*
* @return The generated validity object or <code>null</code> if the
* component is currently not cacheable.
*/
public SourceValidity getValidity() {
return NOPValidity.SHARED_INSTANCE;
}
/**
* Receive notification of the beginning of a document. The event is passed
* to the superclass without any additional processing.
*/
public void startDocument() throws SAXException {
super.startDocument();
}
/**
* Receive notification of the end of a document. The event is passed to the
* superclass without any additional processing.
*/
public void endDocument() throws SAXException {
super.endDocument();
}
/**
 * Begin the scope of a prefix-URI Namespace mapping. The mapping is only
 * forwarded while the transformer is in the ground state, i.e. outside any
 * lucene:index element; otherwise the event is dropped.
 *
 * @param prefix
 *            The Namespace prefix being declared.
 * @param uri
 *            The Namespace URI the prefix is mapped to.
 */
public void startPrefixMapping(String prefix, String uri)
        throws SAXException {
    if (processing != STATE_GROUND) {
        // Inside lucene:index nothing but the index/document elements is
        // sent downstream.
        return;
    }
    super.startPrefixMapping(prefix, uri);
}
/**
 * End the scope of a prefix-URI mapping. The event is only forwarded while
 * the transformer is in the ground state, i.e. outside any lucene:index
 * element; otherwise it is dropped.
 *
 * @param prefix
 *            The prefix that was being mapped.
 */
public void endPrefixMapping(String prefix) throws SAXException {
    if (processing != STATE_GROUND) {
        return;
    }
    super.endPrefixMapping(prefix);
}
/**
 * Handles the start of an element according to the current state.
 * <ul>
 * <li>Ground state: every element is passed on. A lucene:index element
 * additionally has its settings read and switches to the query state.</li>
 * <li>Query state (inside lucene:index): only lucene:document is allowed;
 * it starts the collection of a new Lucene document and is not passed on
 * until it ends.</li>
 * <li>Document state (inside lucene:document): the element is pushed on the
 * element stack together with a copy of its attributes; nothing is passed
 * on.</li>
 * </ul>
 *
 * @throws SAXException
 *             if lucene:index contains anything but lucene:document, if
 *             lucene:document has no url attribute, or if the merge-factor
 *             attribute of lucene:index is not an integer
 */
public void startElement(String namespaceURI, String localName,
        String qName, Attributes atts) throws SAXException {
    final boolean inLuceneNamespace = LUCENE_URI.equals(namespaceURI);

    if (processing == STATE_GROUND) {
        final boolean opensIndex = inLuceneNamespace
                && LUCENE_QUERY_ELEMENT.equals(localName);
        if (opensIndex) {
            readLuceneIndexAttributes(atts);
        }
        // lucene:index is propagated to the next stage in the pipeline just
        // like any other element seen in the ground state
        super.startElement(namespaceURI, localName, qName, atts);
        if (opensIndex) {
            processing = STATE_QUERY;
        }
    } else if (processing == STATE_QUERY) {
        // processing a lucene:index - expecting a lucene:document
        if (!inLuceneNamespace
                || !LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
            throw new SAXException(
                    "<lucene:index> element can contain only <lucene:document> elements!");
        }
        beginLuceneDocument(atts);
    } else if (processing == STATE_DOCUMENT) {
        // The attributes are copied because the caller's Attributes object
        // is only read here, while the copy is used when the element ends.
        elementStack.push(new IndexHelperField(localName,
                new AttributesImpl(atts)));
    }
}

/**
 * Reads the create flag and the indexer settings from the attributes of a
 * lucene:index element. Settings that are not given as attributes take the
 * values stored by <code>setup</code>.
 *
 * @throws SAXException
 *             if the merge-factor attribute is present but not an integer
 */
private void readLuceneIndexAttributes(Attributes atts) throws SAXException {
    createIndex = BooleanUtils.toBoolean(atts
            .getValue(LUCENE_QUERY_CREATE_ATTRIBUTE));

    final String analyzerValue = atts
            .getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
    final String directoryValue = atts
            .getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
    final String mergeFactorValue = atts
            .getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);

    final int mergeFactor;
    if (mergeFactorValue == null) {
        mergeFactor = setupConfiguration.mergeFactor;
    } else {
        try {
            mergeFactor = Integer.parseInt(mergeFactorValue.trim());
        } catch (NumberFormatException e) {
            // Report bad input as a SAX error instead of letting an
            // unchecked exception escape from the content handler.
            throw new SAXException("<lucene:index> @"
                    + LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE
                    + " must be an integer but was '" + mergeFactorValue
                    + "'", e);
        }
    }

    queryConfiguration = new IndexerConfiguration(
            analyzerValue != null ? analyzerValue
                    : setupConfiguration.analyzerClassname,
            directoryValue != null ? directoryValue
                    : setupConfiguration.indexDirectory, mergeFactor);
}

/**
 * Starts the collection of a new Lucene document for a lucene:document
 * element and switches to the document state.
 *
 * @throws SAXException
 *             if the element has no url attribute
 */
private void beginLuceneDocument(Attributes atts) throws SAXException {
    this.bodyDocumentURL = atts.getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
    if (this.bodyDocumentURL == null) {
        throw new SAXException(
                "<lucene:document> must have @url attribute");
    }
    // Remember the time the document indexing began
    this.documentStartTime = System.currentTimeMillis();
    // Remember the attributes so they can be passed on to the next stage
    // in the pipeline when this document element is ended.
    this.documentAttributes = new AttributesImpl(atts);
    this.bodyText = new StringBuffer();
    this.bodyDocument = new Document();
    this.elementStack.clear();
    processing = STATE_DOCUMENT;
}
/**
 * Handles the end of an element according to the current state.
 * <ul>
 * <li>Query state: only the end of lucene:index is accepted; it is passed
 * on and the transformer returns to the ground state.</li>
 * <li>Document state: the end of the lucene:document element itself
 * completes and indexes the Lucene document; the end of any element inside
 * it turns that element into fields of the Lucene document. The
 * lucene:document element is recognised by the element stack being empty,
 * so an element with the same name nested inside the document is treated
 * as ordinary content instead of closing the document early.</li>
 * <li>Any other state: the event is passed on unchanged.</li>
 * </ul>
 *
 * @throws SAXException
 *             if an unexpected element ends inside lucene:index or
 *             lucene:document, or if indexing the document fails
 */
public void endElement(String namespaceURI, String localName, String qName)
        throws SAXException {
    final boolean inLuceneNamespace = LUCENE_URI.equals(namespaceURI);

    if (processing == STATE_QUERY) {
        if (!inLuceneNamespace || !LUCENE_QUERY_ELEMENT.equals(localName)) {
            throw new SAXException("</lucene:index> was expected!");
        }
        // propagate the query element to the next stage in the pipeline
        super.endElement(namespaceURI, localName, qName);
        this.processing = STATE_GROUND;
    } else if (processing == STATE_DOCUMENT) {
        final boolean closesDocument = elementStack.isEmpty()
                && inLuceneNamespace
                && LUCENE_DOCUMENT_ELEMENT.equals(localName);
        if (closesDocument) {
            finishLuceneDocument(namespaceURI, localName, qName);
        } else {
            finishFieldElement(localName, qName);
        }
    } else {
        // All other tags
        super.endElement(namespaceURI, localName, qName);
    }
}

/**
 * Completes the Lucene document of the current lucene:document element,
 * indexes it, and sends the (now empty) lucene:document element with an
 * added elapsed-time attribute to the next stage in the pipeline.
 */
private void finishLuceneDocument(String namespaceURI, String localName,
        String qName) throws SAXException {
    this.bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD,
            this.bodyText.toString()));
    this.bodyText = null;
    this.bodyDocument.add(Field.UnIndexed(LuceneXMLIndexer.URL_FIELD,
            this.bodyDocumentURL));
    // store: false, index: true, tokenize: false
    this.bodyDocument.add(new Field(LuceneXMLIndexer.UID_FIELD,
            uid(this.bodyDocumentURL), false, true, false));
    try {
        reindexDocument();
    } catch (IndexException e) {
        throw new SAXException(e);
    }
    this.bodyDocumentURL = null;

    final long elapsedTime = System.currentTimeMillis()
            - this.documentStartTime;
    this.documentAttributes.addAttribute("",
            LUCENE_ELAPSED_TIME_ATTRIBUTE, LUCENE_ELAPSED_TIME_ATTRIBUTE,
            CDATA, String.valueOf(elapsedTime));
    // The start tag was held back when the element began, so both events
    // are emitted here.
    super.startElement(namespaceURI, localName, qName,
            this.documentAttributes);
    super.endElement(namespaceURI, localName, qName);
    this.processing = STATE_QUERY;
}

/**
 * Turns the element on top of the element stack into fields of the current
 * Lucene document: one unstored field named <code>element@attribute</code>
 * per attribute outside the lucene namespace, and one field named after the
 * element for its text, if it has any. If the element carries the lucene
 * text-attr attribute, its attribute values are also appended to the
 * element text and to the document body text. If it carries the lucene
 * store attribute, the text field is created with <code>Field.Text</code>
 * instead of <code>Field.UnStored</code>.
 */
private void finishFieldElement(String localName, String qName)
        throws SAXException {
    if (elementStack.isEmpty()) {
        throw new SAXException("Unexpected </" + qName
                + "> inside <lucene:document>");
    }
    final IndexHelperField field = (IndexHelperField) elementStack.pop();
    final StringBuffer text = field.getText();
    final Attributes atts = field.getAttributes();

    final boolean attributesToText = atts.getIndex(LUCENE_URI,
            LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
    for (int i = 0; i < atts.getLength(); i++) {
        // Ignore Lucene attributes
        if (LUCENE_URI.equals(atts.getURI(i))) {
            continue;
        }
        final String attributeValue = atts.getValue(i);
        bodyDocument.add(Field.UnStored(localName + "@"
                + atts.getLocalName(i), attributeValue));
        if (attributesToText) {
            text.append(attributeValue).append(' ');
            bodyText.append(attributeValue).append(' ');
        }
    }

    // Only the presence of the store attribute matters, not its value.
    final boolean store = atts.getIndex(LUCENE_URI,
            LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
    if (text.length() > 0) {
        final String fieldText = text.toString();
        bodyDocument.add(store ? Field.Text(localName, fieldText)
                : Field.UnStored(localName, fieldText));
    }
}
/**
 * Handles character data according to the current state.
 * <ul>
 * <li>Document state: non-empty text inside an open element is appended to
 * that element's text and, followed by a single space, to the body text of
 * the document. Text directly inside lucene:document (no element open) is
 * ignored.</li>
 * <li>Ground state: the characters are passed on unchanged.</li>
 * <li>Query state: the characters are dropped.</li>
 * </ul>
 * Chunks of exactly one character are indexed like any other chunk;
 * requiring more than one character would silently lose one-character
 * values.
 */
public void characters(char[] ch, int start, int length)
        throws SAXException {
    if (processing == STATE_DOCUMENT) {
        final boolean hasText = ch.length > 0 && start >= 0 && length > 0;
        if (hasText && !elementStack.isEmpty()) {
            final String text = new String(ch, start, length);
            ((IndexHelperField) elementStack.peek()).append(text);
            bodyText.append(text);
            bodyText.append(' ');
        }
    } else if (processing == STATE_GROUND) {
        super.characters(ch, start, length);
    }
}
/**
 * Looks up the indexer component and configures it from the settings of the
 * current lucene:index element: index directory, analyzer and merge factor.
 * A relative index directory is resolved against the work directory. If no
 * index exists in the directory yet, the create flag is forced on. When the
 * create flag is set the index is cleared after configuration.
 *
 * @throws IndexException
 *             if the indexer cannot be looked up or the index directory
 *             cannot be set
 */
private void openWriter() throws IndexException {
    getLogger().debug("use luceneIndexTransformer with indexer component");

    // lookup the indexer
    try {
        this.indexer = (Indexer) manager.lookup(Indexer.ROLE + "/default");
    } catch (ServiceException e) {
        throw new IndexException(e);
    }

    // Resolve the index directory
    final String configuredPath = queryConfiguration.indexDirectory;
    File indexDirectory = new File(configuredPath);
    if (!indexDirectory.isAbsolute()) {
        indexDirectory = new File(workDir, configuredPath);
    }

    // If the index doesn't exist, then always create it.
    if (!IndexReader.indexExists(indexDirectory)) {
        createIndex = true;
    }

    // Get the index directory, creating it if necessary
    try {
        indexer.setIndex(LuceneCocoonHelper.getDirectory(indexDirectory,
                createIndex));
    } catch (IOException e) {
        throw new IndexException("set directory " + indexDirectory
                + " error", e);
    }

    // Analyzer and merge factor
    indexer.setAnalyzer(LuceneCocoonHelper
            .getAnalyzer(queryConfiguration.analyzerClassname));
    indexer.setMergeFactor(queryConfiguration.mergeFactor);

    if (createIndex) {
        indexer.clearIndex();
    }
}
/**
 * Hands the current Lucene document to the indexer and then forgets it. The
 * indexer is looked up and configured on first use and is kept for the
 * following documents.
 * NOTE(review): whether an older entry for the same document is replaced
 * depends on the Indexer implementation - confirm.
 *
 * @throws IndexException
 *             if the indexer cannot be opened or the document cannot be
 *             indexed
 */
private void reindexDocument() throws IndexException {
    if (indexer == null) {
        openWriter();
    }
    final Document pending = this.bodyDocument;
    indexer.index(pending);
    this.bodyDocument = null;
}
/**
 * Collects what is known about one element inside a lucene:document while
 * the element is open: its local name, its attributes and the character
 * data seen so far.
 */
class IndexHelperField {
    /** Local name of the element. */
    String localName;

    /** Character data collected for the element so far. */
    StringBuffer text;

    /** Attributes the element was started with. */
    Attributes attributes;

    /**
     * Creates a helper with an empty text buffer.
     *
     * @param localName
     *            local name of the element
     * @param atts
     *            attributes of the element; kept by reference
     */
    IndexHelperField(String localName, Attributes atts) {
        this.text = new StringBuffer();
        this.attributes = atts;
        this.localName = localName;
    }

    /**
     * @return the attributes given to the constructor
     */
    public Attributes getAttributes() {
        return this.attributes;
    }

    /**
     * @return the live text buffer; changes to it are visible to this helper
     */
    public StringBuffer getText() {
        return this.text;
    }

    /**
     * Appends a string to the collected text.
     */
    public void append(String text) {
        final StringBuffer buffer = this.text;
        buffer.append(text);
    }

    /**
     * Appends <code>length</code> characters of <code>str</code>, starting
     * at <code>offset</code>, to the collected text.
     */
    public void append(char[] str, int offset, int length) {
        final StringBuffer buffer = this.text;
        buffer.append(str, offset, length);
    }
}
/**
 * Plain holder for the three indexer settings: analyzer class name, index
 * directory and merge factor.
 */
class IndexerConfiguration {
    /** Class name of the Lucene analyzer to use. */
    String analyzerClassname;

    /** Path of the index directory, absolute or relative. */
    String indexDirectory;

    /** Merge factor handed to the indexer. */
    int mergeFactor;

    /**
     * Stores the given values as they are; no validation is done.
     */
    public IndexerConfiguration(String analyzerClassname,
            String indexDirectory, int mergeFactor) {
        this.mergeFactor = mergeFactor;
        this.indexDirectory = indexDirectory;
        this.analyzerClassname = analyzerClassname;
    }
}
}