Lucas/src/main/java/org/apache/uima/lucas/ProspectiveSearchAE.java - uima-addons - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.lucas;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.search.highlight.QueryScorer;
 import org.apache.lucene.search.highlight.TextFragment;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.*;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.resource.ResourceAccessException;
 import org.apache.uima.resource.ResourceInitializationException;

 import java.io.IOException;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;

 /**
  * The <code>ProspectiveSearchAE<code> monitors if one of the defined
  * search queries occurs in the processed document, for each matching
  * search query a FS is inserted into the CAS.
  * <p>
  * Optionally the matched text can be marked by a set of annotations, the most
  * common use case for this is search term highlighting.
  * <p>
  * The defined search queries are provided by a user implemented
  * {@link SearchQueryProvider}, which could for example retrieve
  * the search queries from a database or a web service.
  * <p>
  * The implementation first indexes the document and then searches all defined
  * queries against this one document index, for indexing the Lucene {@link MemoryIndex}
  * is used. Notes about the runtime performance can be found in the javadoc of the
  * <code>MemoryIndex</code> class.
  *
  * @see SearchQueryProvider
  * @see SearchQuery
  * @see MemoryIndex
  */
 public class ProspectiveSearchAE extends LuceneDocumentAE {

 	private SearchQueryProvider searchQueryProvider;

 	/**
 	 * The search result type. For each matching query one search result feature
 	 * structure will be inserted into the <code>CAS</code>.
 	 * <p>
 	 * The FS must have one long feature to identify the matching query.
 	 * <p>
 	 * Optionally the FS has an array feature which contains annotations which
 	 * mark the matching text of the query in the document to enable hit
 	 * highlighting.
 	 */
 	private Type searchResultType;

 	/**
 	 * The id feature of the search result type.
 	 */
 	private Feature searchResultIdFeature;

 	/**
 	 * The array feature which contains annotations which mark the matching
 	 * text.
 	 */
 	private Feature searchResultMatchingTextFeature;

 	/**
 	 * The type used to mark the matching text.
 	 */
 	private Type matchingTextType;

 	private float matchingThreshold = 0.0f;

 	@Override
 	public void initialize(UimaContext aContext)
 			throws ResourceInitializationException {
 		super.initialize(aContext);

 		try {
 			searchQueryProvider = (SearchQueryProvider) aContext
 					.getResourceObject("searchQueryProvider");
 		} catch (ResourceAccessException e) {
 			throw new ResourceInitializationException(e);
 		}
 	}

 	@Override
 	public void typeSystemInit(TypeSystem aTypeSystem)
 			throws AnalysisEngineProcessException {
 		super.typeSystemInit(aTypeSystem);

 		String searchResultTypeString = (String) getContext().getConfigParameterValue(
                 "org.apache.uima.lucas.SearchResultType");
 		searchResultType = aTypeSystem.getType(searchResultTypeString);

 		String searchResultIdFeatureString = (String) getContext().getConfigParameterValue(
                 "org.apache.uima.lucas.SearchResultIdFeature");
 		searchResultIdFeature =  searchResultType.getFeatureByBaseName(searchResultIdFeatureString);

 		String searchResultMatchingTextFeatureString = (String) getContext().getConfigParameterValue(
 				"org.apache.uima.lucas.SearchResulMatchingTextFeature");
 		if (searchResultMatchingTextFeatureString != null) {
 			searchResultMatchingTextFeature = searchResultType.getFeatureByBaseName(searchResultMatchingTextFeatureString);

 			String matchingTextTypeString = (String) getContext().getConfigParameterValue(
 					"org.apache.uima.lucas.MatchingTextType");

 			if (matchingTextTypeString != null) {
 				matchingTextType = aTypeSystem.getType(matchingTextTypeString);
 			}
 			else {
 				matchingTextType = aTypeSystem.getType(CAS.TYPE_NAME_ANNOTATION);
 			}
 		}
 	}

 	@Override
 	public void process(CAS aCAS)
 			throws AnalysisEngineProcessException {

 		// First create the index of the document text
 		MemoryIndex index = new MemoryIndex();

 		List fields = createDocument(aCAS).getFields();

 		for (Iterator it = fields.iterator(); it.hasNext(); ) {
 			Field field = (Field) it.next();

 			if (field.isIndexed() && field.tokenStreamValue() != null) {
 			  index.addField(field.name(), field.tokenStreamValue());
 			}
 		}

 		// Search all queries against the one document index
 		for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {

 			float score = index.search(query.query());

 			if (score > matchingThreshold) {

 				// Add a FS to the CAS with the search result
 				FeatureStructure searchResult = aCAS.createFS(searchResultType);
 				searchResult.setLongValue(searchResultIdFeature, query.id());
 				aCAS.addFsToIndexes(searchResult);

 				// Find matching tokens and link their annotations
 				// in case the user wants search term highlighting
 				if (searchResultMatchingTextFeature != null) {

 					fields = createDocument(aCAS).getFields();

 					for (Iterator it = fields.iterator(); it.hasNext(); ) {

 						Field field = (Field) it.next();

 						if (field.isIndexed() && field.tokenStreamValue() != null) {

 							TokenStream tokenStream = field.tokenStreamValue();

 							Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();

 							QueryScorer scorer = new QueryScorer(query.query(), field.name());
 							scorer.startFragment(new TextFragment(
 									new StringBuffer(aCAS.getDocumentText()), 0, 0));

 							try {
 								scorer.init(tokenStream);

 								OffsetAttribute offsetAttr = null;
 								while (tokenStream.incrementToken()) {
 									offsetAttr = (OffsetAttribute)tokenStream.getAttribute(OffsetAttribute.class);
 									float tokenScore = scorer.getTokenScore();
 									if (tokenScore > 0) {
 										AnnotationFS annotation = aCAS.createAnnotation(matchingTextType,
 												offsetAttr.startOffset(), offsetAttr.endOffset());

 										matchingTextAnnotations.add(annotation);
 									}
 								}
 							}
 							catch (IOException e) {
 								throw new AnalysisEngineProcessException(e);
 							}

 							ArrayFS matchtingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());

 							int matchtingTextArrayIndex = 0;
 							for (AnnotationFS matchingTextAnnotation: matchingTextAnnotations) {
 								matchtingTextArray.set(matchtingTextArrayIndex++,
 										matchingTextAnnotation);
 							}

 							searchResult.setFeatureValue(searchResultMatchingTextFeature,
 									matchtingTextArray);
 						}
 					}
 				}
 			}
 		}
 	}
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.uima.lucas;

	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.index.memory.MemoryIndex;
	import org.apache.lucene.search.highlight.QueryScorer;
	import org.apache.lucene.search.highlight.TextFragment;
	import org.apache.uima.UimaContext;
	import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
	import org.apache.uima.cas.*;
	import org.apache.uima.cas.text.AnnotationFS;
	import org.apache.uima.resource.ResourceAccessException;
	import org.apache.uima.resource.ResourceInitializationException;

	import java.io.IOException;
	import java.util.Collection;
	import java.util.Iterator;
	import java.util.LinkedList;
	import java.util.List;

	/**
	* The <code>ProspectiveSearchAE<code> monitors if one of the defined
	* search queries occurs in the processed document, for each matching
	* search query a FS is inserted into the CAS.
	* <p>
	* Optionally the matched text can be marked by a set of annotations, the most
	* common use case for this is search term highlighting.
	* <p>
	* The defined search queries are provided by a user implemented
	* {@link SearchQueryProvider}, which could for example retrieve
	* the search queries from a database or a web service.
	* <p>
	* The implementation first indexes the document and then searches all defined
	* queries against this one document index, for indexing the Lucene {@link MemoryIndex}
	* is used. Notes about the runtime performance can be found in the javadoc of the
	* <code>MemoryIndex</code> class.
	*
	* @see SearchQueryProvider
	* @see SearchQuery
	* @see MemoryIndex
	*/
	public class ProspectiveSearchAE extends LuceneDocumentAE {

	private SearchQueryProvider searchQueryProvider;

	/**
	* The search result type. For each matching query one search result feature
	* structure will be inserted into the <code>CAS</code>.
	* <p>
	* The FS must have one long feature to identify the matching query.
	* <p>
	* Optionally the FS has an array feature which contains annotations which
	* mark the matching text of the query in the document to enable hit
	* highlighting.
	*/
	private Type searchResultType;

	/**
	* The id feature of the search result type.
	*/
	private Feature searchResultIdFeature;

	/**
	* The array feature which contains annotations which mark the matching
	* text.
	*/
	private Feature searchResultMatchingTextFeature;

	/**
	* The type used to mark the matching text.
	*/
	private Type matchingTextType;

	private float matchingThreshold = 0.0f;

	@Override
	public void initialize(UimaContext aContext)
	throws ResourceInitializationException {
	super.initialize(aContext);

	try {
	searchQueryProvider = (SearchQueryProvider) aContext
	.getResourceObject("searchQueryProvider");
	} catch (ResourceAccessException e) {
	throw new ResourceInitializationException(e);
	}
	}

	@Override
	public void typeSystemInit(TypeSystem aTypeSystem)
	throws AnalysisEngineProcessException {
	super.typeSystemInit(aTypeSystem);

	String searchResultTypeString = (String) getContext().getConfigParameterValue(
	"org.apache.uima.lucas.SearchResultType");
	searchResultType = aTypeSystem.getType(searchResultTypeString);

	String searchResultIdFeatureString = (String) getContext().getConfigParameterValue(
	"org.apache.uima.lucas.SearchResultIdFeature");
	searchResultIdFeature = searchResultType.getFeatureByBaseName(searchResultIdFeatureString);

	String searchResultMatchingTextFeatureString = (String) getContext().getConfigParameterValue(
	"org.apache.uima.lucas.SearchResulMatchingTextFeature");
	if (searchResultMatchingTextFeatureString != null) {
	searchResultMatchingTextFeature = searchResultType.getFeatureByBaseName(searchResultMatchingTextFeatureString);

	String matchingTextTypeString = (String) getContext().getConfigParameterValue(
	"org.apache.uima.lucas.MatchingTextType");

	if (matchingTextTypeString != null) {
	matchingTextType = aTypeSystem.getType(matchingTextTypeString);
	}
	else {
	matchingTextType = aTypeSystem.getType(CAS.TYPE_NAME_ANNOTATION);
	}
	}
	}

	@Override
	public void process(CAS aCAS)
	throws AnalysisEngineProcessException {

	// First create the index of the document text
	MemoryIndex index = new MemoryIndex();

	List fields = createDocument(aCAS).getFields();

	for (Iterator it = fields.iterator(); it.hasNext(); ) {
	Field field = (Field) it.next();

	if (field.isIndexed() && field.tokenStreamValue() != null) {
	index.addField(field.name(), field.tokenStreamValue());
	}
	}

	// Search all queries against the one document index
	for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {

	float score = index.search(query.query());

	if (score > matchingThreshold) {

	// Add a FS to the CAS with the search result
	FeatureStructure searchResult = aCAS.createFS(searchResultType);
	searchResult.setLongValue(searchResultIdFeature, query.id());
	aCAS.addFsToIndexes(searchResult);

	// Find matching tokens and link their annotations
	// in case the user wants search term highlighting
	if (searchResultMatchingTextFeature != null) {

	fields = createDocument(aCAS).getFields();

	for (Iterator it = fields.iterator(); it.hasNext(); ) {

	Field field = (Field) it.next();

	if (field.isIndexed() && field.tokenStreamValue() != null) {

	TokenStream tokenStream = field.tokenStreamValue();

	Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();

	QueryScorer scorer = new QueryScorer(query.query(), field.name());
	scorer.startFragment(new TextFragment(
	new StringBuffer(aCAS.getDocumentText()), 0, 0));

	try {
	scorer.init(tokenStream);

	OffsetAttribute offsetAttr = null;
	while (tokenStream.incrementToken()) {
	offsetAttr = (OffsetAttribute)tokenStream.getAttribute(OffsetAttribute.class);
	float tokenScore = scorer.getTokenScore();
	if (tokenScore > 0) {
	AnnotationFS annotation = aCAS.createAnnotation(matchingTextType,
	offsetAttr.startOffset(), offsetAttr.endOffset());

	matchingTextAnnotations.add(annotation);
	}
	}
	}
	catch (IOException e) {
	throw new AnalysisEngineProcessException(e);
	}

	ArrayFS matchtingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());

	int matchtingTextArrayIndex = 0;
	for (AnnotationFS matchingTextAnnotation: matchingTextAnnotations) {
	matchtingTextArray.set(matchtingTextArrayIndex++,
	matchingTextAnnotation);
	}

	searchResult.setFeatureValue(searchResultMatchingTextFeature,
	matchtingTextArray);
	}
	}
	}
	}
	}
	}
	}