// RELEASE_2_1_13/src/blocks/lucene/java/org/apache/cocoon/components/search/SimpleLuceneXMLIndexerImpl.java - cocoon - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.cocoon.components.search;

 import org.apache.avalon.framework.configuration.Configurable;
 import org.apache.avalon.framework.configuration.Configuration;
 import org.apache.avalon.framework.configuration.ConfigurationException;
 import org.apache.avalon.framework.logger.AbstractLogEnabled;
 import org.apache.avalon.framework.service.ServiceException;
 import org.apache.avalon.framework.service.ServiceManager;
 import org.apache.avalon.framework.service.Serviceable;
 import org.apache.avalon.framework.thread.ThreadSafe;
 import org.apache.cocoon.ProcessingException;
 import org.apache.commons.lang.StringUtils;
 import org.apache.excalibur.xml.sax.SAXParser;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;

 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.net.URLConnection;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;


 /**
  * Builds Lucene documents from the XML content served at a URL.
  *
  * <p>Two configuration elements affect the way it works:</p>
  * <p>
  *   <tt>&lt;store-fields/&gt;</tt>
  *   Sets which tags in your content are stored in Lucene as fields,
  *   during the indexing process. Allows them to be output with search hits.
  * </p><p>
  *   <tt>&lt;content-view-query/&gt;</tt>
  *   Sets the view the indexer will request for indexing content.
  * </p><p>
  *   Example configuration (goes in cocoon.xconf)
  *   <pre><tt>
  *     &lt;lucene-xml-indexer logger="core.search.lucene"&gt;
  *       &lt;store-fields&gt;title, summary&lt;/store-fields&gt;
  *       &lt;content-view-query&gt;cocoon-view=search&lt;/content-view-query&gt;
  *     &lt;/lucene-xml-indexer&gt;
  *   </tt></pre></p>
  *
  * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
  * @author <a href="mailto:jeremy@apache.org">Jeremy Quinn</a>
  * @version CVS $Id$
  */
 public class SimpleLuceneXMLIndexerImpl extends AbstractLogEnabled
          implements LuceneXMLIndexer, Configurable, Serviceable, ThreadSafe {

     /**
      * Name of the config element holding the query string that is appended
      * to a URL in order to request its content view.
      * <p>
      *  Its value is <code>content-view-query</code>.
      * </p>
      */
     public final static String CONTENT_VIEW_QUERY_CONFIG = "content-view-query";

     /**
      * Query string appended to the URL when no
      * <code>content-view-query</code> element is configured.
      */
     final static String CONTENT_VIEW_QUERY_DEFAULT = "cocoon-view=content";

     /**
      * Name of the config element listing the tags to be added as
      * Stored, Untokenised, Unindexed Fields.
      * <p>
      *  Its value is <code>store-fields</code>.
      * </p>
      */
     public final static String FIELDTAGS_CONFIG = "store-fields";

     /**
      * The service manager instance, used to look up the SAX parser.
      */
     protected ServiceManager manager = null;

     /**
      * Set of content types (without parameters) that get indexed.
      */
     final HashSet allowedContentType;

     /** Query string appended to every URL before its content is fetched. */
     private String contentViewQuery = CONTENT_VIEW_QUERY_DEFAULT;

     /** Tag names handed to the content handler via <code>setFieldTags</code>. */
     private HashSet fieldTags;


     /**
      * Creates an indexer that accepts <code>text/xml</code> and
      * <code>text/xhtml</code> content and has no field tags.
      */
     public SimpleLuceneXMLIndexerImpl() {
         allowedContentType = new HashSet();
         allowedContentType.add("text/xml");
         allowedContentType.add("text/xhtml");
         fieldTags = new HashSet();
     }


     /**
      * Reads the <code>store-fields</code> and <code>content-view-query</code>
      * settings.
      *
      * @param  configuration  the configuration of this component
      * @exception  ConfigurationException  if a <code>store-fields</code>
      *   element has no readable value
      */
     public void configure(Configuration configuration) throws ConfigurationException {

         Configuration[] children = configuration.getChildren(FIELDTAGS_CONFIG);
         if (children != null && children.length > 0) {
             // Collect into a local set and assign it once, so the field never
             // refers to a half-filled set if reading a value fails.
             HashSet tags = new HashSet();
             for (int i = 0; i < children.length; i++) {
                 // Both ',' and ' ' act as separators, so "title, summary"
                 // yields the two names "title" and "summary".
                 String[] names = StringUtils.split(children[i].getValue(), ", ");
                 for (int j = 0; j < names.length; j++) {
                     String name = names[j];
                     if (!name.equals("")) {
                         tags.add(name);
                         if (getLogger().isDebugEnabled()) {
                             getLogger().debug("add field: " + name);
                         }
                     }
                 }
             }
             this.fieldTags = tags;
         } else {
             if (getLogger().isDebugEnabled()) {
                 getLogger().debug("Do not add any fields");
             }
         }

         this.contentViewQuery = configuration.getChild(CONTENT_VIEW_QUERY_CONFIG, true)
                 .getValue(CONTENT_VIEW_QUERY_DEFAULT);
         if (getLogger().isDebugEnabled()) {
             getLogger().debug("content view: " + this.contentViewQuery);
         }
     }


     /**
      * Set the current <code>ServiceManager</code> instance used by this
      * <code>Serviceable</code>.
      *
      * @param  manager  the service manager to keep for later lookups
      * @exception  ServiceException  declared by the interface; not thrown here
      */
     public void service(ServiceManager manager) throws ServiceException {
         this.manager = manager;
     }


     /**
      * Build Lucene documents from a URL.
      *
      * <p>The configured content view query is appended to the URL, the
      * resulting content is parsed, and every document produced by the
      * content handler receives a URL field and a UID field.</p>
      *
      * @param  url  the content of this url gets indexed.
      * @return  the documents built from the content, or an empty list if the
      *   content has no content type or one that is not allowed
      * @exception  ProcessingException  if the URL cannot be opened, read or
      *   parsed
      */
     public List build(URL url)
              throws ProcessingException {

         try {
             String file = url.getFile();
             // getFile() includes an existing query string, so pick the
             // separator that keeps the resulting query well-formed.
             String separator = (file.indexOf("?") == -1) ? "?" : "&";
             URL contentURL = new URL(url, file + separator + contentViewQuery);

             URLConnection contentURLConnection = contentURL.openConnection();
             if (contentURLConnection == null) {
                 throw new ProcessingException("Can not open connection to URL "
                         + contentURL + " (null connection)");
             }

             String contentType = contentURLConnection.getContentType();
             if (contentType == null) {
                 if (getLogger().isDebugEnabled()) {
                     getLogger().debug("Ignoring " + contentURL + " (no content type)");
                 }
                 return Collections.EMPTY_LIST;
             }

             // Drop parameters such as "; charset=UTF-8" and any whitespace
             // around the media type before comparing it.
             int index = contentType.indexOf(';');
             if (index != -1) {
                 contentType = contentType.substring(0, index);
             }
             contentType = contentType.trim();

             if (!allowedContentType.contains(contentType)) {
                 if (getLogger().isDebugEnabled()) {
                     getLogger().debug("Ignoring " + contentURL + " (" + contentType + ")");
                 }
                 return Collections.EMPTY_LIST;
             }

             if (getLogger().isDebugEnabled()) {
                 getLogger().debug("Indexing " + contentURL + " (" + contentType + ")");
             }

             // The uid is the same for every document of this connection, so
             // compute it once, before the stream is consumed and closed.
             String uid = uid(contentURLConnection);
             String urlString = url.toString();

             LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler();
             luceneIndexContentHandler.setFieldTags(fieldTags);
             indexDocument(contentURLConnection, luceneIndexContentHandler);

             // document is parsed: tag every resulting document
             Iterator it = luceneIndexContentHandler.iterator();
             while (it.hasNext()) {
                 Document d = (Document) it.next();
                 d.add(Field.UnIndexed(URL_FIELD, urlString));
                 // store ... false, index ... true, token ... false
                 d.add(new Field(UID_FIELD, uid, false, true, false));
             }

             return luceneIndexContentHandler.allDocuments();
         } catch (IOException ioe) {
             throw new ProcessingException("Cannot read URL " + url, ioe);
         }
     }


     /**
      * Parses the content of a connection, feeding the SAX events to the
      * given content handler. The content stream is closed and the parser
      * released before this method returns, whether or not parsing succeeded.
      *
      * @param  contentURLConnection       the xml content which should get indexed.
      * @param  luceneIndexContentHandler  ContentHandler for generating
      *   a lucene Document from XML content.
      * @exception  ProcessingException    if the content cannot be read or
      *   parsed, or no XML parser can be looked up
      */
     private void indexDocument(URLConnection contentURLConnection,
             LuceneIndexContentHandler luceneIndexContentHandler)
              throws ProcessingException {

         InputStream is = null;
         SAXParser parser = null;

         try {
             is = contentURLConnection.getInputStream();
             InputSource in = new InputSource(is);

             // get an XML parser
             parser = (SAXParser) this.manager.lookup(SAXParser.ROLE);
             parser.parse(in, luceneIndexContentHandler);
         } catch (IOException ioe) {
             throw new ProcessingException("Cannot read!", ioe);
         } catch (SAXException saxe) {
             throw new ProcessingException("Cannot parse!", saxe);
         } catch (ServiceException se) {
             throw new ProcessingException("Cannot lookup xml parser!", se);
         } finally {
             if (parser != null) {
                 this.manager.release(parser);
             }
             if (is != null) {
                 try {
                     is.close();
                 } catch (IOException ioe) {
                     // Best effort only: a failing close must not hide the
                     // outcome of the parse.
                     if (getLogger().isDebugEnabled()) {
                         getLogger().debug("Cannot close content stream", ioe);
                     }
                 }
             }
         }
     }


     /**
      * Return the uid of a url connection, built from its string form and its
      * last-modified time.
      *
      * @param  urlConnection  the connection to derive the uid from
      * @return                String uid of the urlConnection
      */
     private String uid(URLConnection urlConnection) {
         // Append path and date into a string in such a way that lexicographic
         // sorting gives the same results as a walk of the file hierarchy.  Thus
         // null (\u0000) is used both to separate directory components and to
         // separate the path from the date.
         // NOTE(review): the text returned by toString() depends on the
         // URLConnection implementation - confirm it is stable across runs.
         return urlConnection.toString().replace('/', '\u0000') +
                 "\u0000" +
                 DateField.timeToString(urlConnection.getLastModified());
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.cocoon.components.search;

	import org.apache.avalon.framework.configuration.Configurable;
	import org.apache.avalon.framework.configuration.Configuration;
	import org.apache.avalon.framework.configuration.ConfigurationException;
	import org.apache.avalon.framework.logger.AbstractLogEnabled;
	import org.apache.avalon.framework.service.ServiceException;
	import org.apache.avalon.framework.service.ServiceManager;
	import org.apache.avalon.framework.service.Serviceable;
	import org.apache.avalon.framework.thread.ThreadSafe;
	import org.apache.cocoon.ProcessingException;
	import org.apache.commons.lang.StringUtils;
	import org.apache.excalibur.xml.sax.SAXParser;
	import org.apache.lucene.document.DateField;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.xml.sax.InputSource;
	import org.xml.sax.SAXException;

	import java.io.IOException;
	import java.io.InputStream;
	import java.net.URL;
	import java.net.URLConnection;
	import java.util.Collections;
	import java.util.HashSet;
	import java.util.Iterator;
	import java.util.List;


	/**
	* A simple class building lucene documents from xml content.
	*
	* <p>It has two parameters that effect the way it works:</p>
	* <p>
	* <tt><store-fields/></tt>
	* Sets which tags in your content are stored in Lucene as fields,
	* during the indexing process. Allows them to be output with search hits.
	* </p><p>
	* <tt><content-view-query/></tt>
	* Sets the view the indexer will request for indexing content.
	* </p><p>
	* Example configuration (goes in cocoon.xconf)
	* <pre><tt>
	* <lucene-xml-indexer logger="core.search.lucene">
	* <store-fields>title, summary</store-fields>
	* <content-view-query>cocoon-view=search</content-view-query>
	* </lucene-xml-indexer>
	* </tt></pre></p>
	*
	* @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
	* @author <a href="mailto:jeremy@apache.org">Jeremy Quinn</a>
	* @version CVS $Id$
	*/
	public class SimpleLuceneXMLIndexerImpl extends AbstractLogEnabled
	implements LuceneXMLIndexer, Configurable, Serviceable, ThreadSafe {

	/**
	* The service manager instance
	*
	* @since
	*/
	protected ServiceManager manager = null;

	/**
	* Config element name specifying query-string appendend for requesting links
	* of an URL.
	* <p>
	* Its value is <code>link-view-query</code>.
	* </p>
	*
	* @since
	*/
	public final static String CONTENT_VIEW_QUERY_CONFIG = "content-view-query";

	/**
	* append this string to the url in order to get the
	* content view of the url
	*
	* @since
	*/

	final static String CONTENT_VIEW_QUERY_DEFAULT = "cocoon-view=content";

	/**
	* Config element name specifying the tags to be added as Stored, Untokenised, Unindexed Fields.
	* <p>
	* Its value is <code>field-tags</code>.
	* </p>
	*
	* @since
	*/
	public final static String FIELDTAGS_CONFIG = "store-fields";

	/**
	* set of allowed content types
	*
	* @since
	*/
	final HashSet allowedContentType;


	/**
	* @since
	*/
	public SimpleLuceneXMLIndexerImpl() {
	allowedContentType = new HashSet();
	allowedContentType.add("text/xml");
	allowedContentType.add("text/xhtml");
	fieldTags = new HashSet();
	}


	private String contentViewQuery = CONTENT_VIEW_QUERY_DEFAULT;
	private HashSet fieldTags;


	/**
	* configure
	*
	* @param configuration
	* @exception ConfigurationException
	* @since
	*/
	public void configure(Configuration configuration) throws ConfigurationException {

	Configuration[] children;
	children = configuration.getChildren(FIELDTAGS_CONFIG);
	if (children != null && children.length > 0) {
	fieldTags = new HashSet();
	for (int i = 0; i < children.length; i++) {
	String pattern = children[i].getValue();
	String params[] = StringUtils.split(pattern, ", ");
	for (int index = 0; index < params.length; index++) {
	String tokenized_pattern = params[index];
	if (!tokenized_pattern.equals("")) {
	this.fieldTags.add(tokenized_pattern);
	if (getLogger().isDebugEnabled()) {
	getLogger().debug("add field: " + tokenized_pattern);
	}
	}
	}
	}
	} else {
	if (getLogger().isDebugEnabled()) {
	getLogger().debug("Do not add any fields");
	}
	}
	this.contentViewQuery = configuration.getChild(CONTENT_VIEW_QUERY_CONFIG, true).getValue(CONTENT_VIEW_QUERY_DEFAULT);
	if (getLogger().isDebugEnabled()) {
	getLogger().debug("content view: " + this.contentViewQuery);
	}
	}


	/**
	* Set the current <code>ServiceManager</code> instance used by this
	* <code>Serviceable</code>.
	*
	* @param manager Description of Parameter
	* @exception ServiceException Description of Exception
	* @since
	*/
	public void service(ServiceManager manager) throws ServiceException {
	this.manager = manager;
	}


	/**
	* Build lucenen documents from a URL
	*
	* @param url the content of this url gets indexed.
	* @exception ProcessingException Description of Exception
	* @since
	*/
	public List build(URL url)
	throws ProcessingException {

	try {
	URL contentURL = new URL(url, url.getFile()
	+ ((url.getFile().indexOf("?") == -1) ? "?" : "&")
	+ contentViewQuery);
	URLConnection contentURLConnection = contentURL.openConnection();
	if (contentURLConnection == null) {
	throw new ProcessingException("Can not open connection to URL "
	+ contentURL + " (null connection)");
	}

	String contentType = contentURLConnection.getContentType();
	if (contentType == null) {
	if (getLogger().isDebugEnabled()) {
	getLogger().debug("Ignoring " + contentURL + " (no content type)");
	}

	return Collections.EMPTY_LIST;
	}

	int index = contentType.indexOf(';');
	if (index != -1) {
	contentType = contentType.substring(0, index);
	}

	if (allowedContentType.contains(contentType)) {
	if (getLogger().isDebugEnabled()) {
	getLogger().debug("Indexing " + contentURL + " (" + contentType + ")");
	}

	LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler();
	luceneIndexContentHandler.setFieldTags(fieldTags);
	indexDocument(contentURLConnection, luceneIndexContentHandler);
	//
	// document is parsed
	//
	Iterator it = luceneIndexContentHandler.iterator();
	while (it.hasNext()) {
	Document d = (Document) it.next();
	d.add(Field.UnIndexed(URL_FIELD, url.toString()));
	// store ... false, index ... true, token ... false
	d.add(new Field(UID_FIELD, uid(contentURLConnection), false, true, false));
	}

	return luceneIndexContentHandler.allDocuments();
	} else {
	if (getLogger().isDebugEnabled()) {
	getLogger().debug("Ignoring " + contentURL + " (" + contentType + ")");
	}

	return Collections.EMPTY_LIST;
	}
	} catch (IOException ioe) {
	throw new ProcessingException("Cannot read URL " + url, ioe);
	}
	}


	/**
	* index input stream producing lucene Documents
	*
	* @param contentURLConnection the xml content which should get indexed.
	* @param luceneIndexContentHandler ContentHandler for generating
	* a lucene Document from XML content.
	* @exception ProcessingException Description of Exception
	* @since
	*/
	private void indexDocument(URLConnection contentURLConnection,
	LuceneIndexContentHandler luceneIndexContentHandler)
	throws ProcessingException {

	InputStream is = null;
	InputSource in = null;
	SAXParser parser = null;

	try {
	is = contentURLConnection.getInputStream();
	in = new InputSource(is);

	// get an XML parser
	parser = (SAXParser) this.manager.lookup(SAXParser.ROLE);
	//reader.setErrorHandler(new CocoonErrorHandler());
	parser.parse(in, luceneIndexContentHandler);
	//
	// document is parsed
	//
	} catch (IOException ioe) {
	throw new ProcessingException("Cannot read!", ioe);
	} catch (SAXException saxe) {
	throw new ProcessingException("Cannot parse!", saxe);
	} catch (ServiceException se) {
	throw new ProcessingException("Cannot lookup xml parser!", se);
	} finally {
	if (parser != null) {
	this.manager.release(parser);
	}
	}
	}


	/**
	* return a unique uid of a url connection
	*
	* @param urlConnection Description of Parameter
	* @return String unique uid of a urlConnection
	* @since
	*/
	private String uid(URLConnection urlConnection) {
	// Append path and date into a string in such a way that lexicographic
	// sorting gives the same results as a walk of the file hierarchy. Thus
	// null (\u0000) is used both to separate directory components and to
	// separate the path from the date.
	return urlConnection.toString().replace('/', '\u0000') +
	"\u0000" +
	DateField.timeToString(urlConnection.getLastModified());
	}
	}