blob: ad28f14174b6aeb2daaf6ae14bfd1f47b64abd61 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.lucas;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.*;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceAccessException;
import org.apache.uima.resource.ResourceInitializationException;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
/**
* The <code>ProspectiveSearchAE</code> checks whether any of the defined
* search queries matches the processed document; for each matching
* search query a FS is inserted into the CAS.
* <p>
* Optionally the matched text can be marked by a set of annotations, the most
* common use case for this is search term highlighting.
* <p>
* The defined search queries are provided by a user implemented
* {@link SearchQueryProvider}, which could for example retrieve
* the search queries from a database or a web service.
* <p>
* The implementation first indexes the document and then searches all defined
* queries against this single-document index. The Lucene {@link MemoryIndex}
* is used for indexing; notes about the runtime performance can be found in
* the javadoc of the <code>MemoryIndex</code> class.
*
* @see SearchQueryProvider
* @see SearchQuery
* @see MemoryIndex
*/
public class ProspectiveSearchAE extends LuceneDocumentAE {
private SearchQueryProvider searchQueryProvider;
/**
* The search result type. For each matching query one search result feature
* structure will be inserted into the <code>CAS</code>.
* <p>
* The FS must have one long feature to identify the matching query.
* <p>
* Optionally the FS has an array feature which contains annotations which
* mark the matching text of the query in the document to enable hit
* highlighting.
*/
private Type searchResultType;
/**
* The id feature of the search result type.
*/
private Feature searchResultIdFeature;
/**
* The array feature which contains annotations which mark the matching
* text.
*/
private Feature searchResultMatchingTextFeature;
/**
* The type used to mark the matching text.
*/
private Type matchingTextType;
private float matchingThreshold = 0.0f;
@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException {
super.initialize(aContext);
try {
searchQueryProvider = (SearchQueryProvider) aContext
.getResourceObject("searchQueryProvider");
} catch (ResourceAccessException e) {
throw new ResourceInitializationException(e);
}
}
@Override
public void typeSystemInit(TypeSystem aTypeSystem)
throws AnalysisEngineProcessException {
super.typeSystemInit(aTypeSystem);
String searchResultTypeString = (String) getContext().getConfigParameterValue(
"org.apache.uima.lucas.SearchResultType");
searchResultType = aTypeSystem.getType(searchResultTypeString);
String searchResultIdFeatureString = (String) getContext().getConfigParameterValue(
"org.apache.uima.lucas.SearchResultIdFeature");
searchResultIdFeature = searchResultType.getFeatureByBaseName(searchResultIdFeatureString);
String searchResultMatchingTextFeatureString = (String) getContext().getConfigParameterValue(
"org.apache.uima.lucas.SearchResulMatchingTextFeature");
if (searchResultMatchingTextFeatureString != null) {
searchResultMatchingTextFeature = searchResultType.getFeatureByBaseName(searchResultMatchingTextFeatureString);
String matchingTextTypeString = (String) getContext().getConfigParameterValue(
"org.apache.uima.lucas.MatchingTextType");
if (matchingTextTypeString != null) {
matchingTextType = aTypeSystem.getType(matchingTextTypeString);
}
else {
matchingTextType = aTypeSystem.getType(CAS.TYPE_NAME_ANNOTATION);
}
}
}
@Override
public void process(CAS aCAS)
throws AnalysisEngineProcessException {
// First create the index of the document text
MemoryIndex index = new MemoryIndex();
List fields = createDocument(aCAS).getFields();
for (Iterator it = fields.iterator(); it.hasNext(); ) {
Field field = (Field) it.next();
if (field.isIndexed() && field.tokenStreamValue() != null) {
index.addField(field.name(), field.tokenStreamValue());
}
}
// Search all queries against the one document index
for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {
float score = index.search(query.query());
if (score > matchingThreshold) {
// Add a FS to the CAS with the search result
FeatureStructure searchResult = aCAS.createFS(searchResultType);
searchResult.setLongValue(searchResultIdFeature, query.id());
aCAS.addFsToIndexes(searchResult);
// Find matching tokens and link their annotations
// in case the user wants search term highlighting
if (searchResultMatchingTextFeature != null) {
fields = createDocument(aCAS).getFields();
for (Iterator it = fields.iterator(); it.hasNext(); ) {
Field field = (Field) it.next();
if (field.isIndexed() && field.tokenStreamValue() != null) {
TokenStream tokenStream = field.tokenStreamValue();
Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();
QueryScorer scorer = new QueryScorer(query.query(), field.name());
scorer.startFragment(new TextFragment(
new StringBuffer(aCAS.getDocumentText()), 0, 0));
try {
scorer.init(tokenStream);
OffsetAttribute offsetAttr = null;
while (tokenStream.incrementToken()) {
offsetAttr = (OffsetAttribute)tokenStream.getAttribute(OffsetAttribute.class);
float tokenScore = scorer.getTokenScore();
if (tokenScore > 0) {
AnnotationFS annotation = aCAS.createAnnotation(matchingTextType,
offsetAttr.startOffset(), offsetAttr.endOffset());
matchingTextAnnotations.add(annotation);
}
}
}
catch (IOException e) {
throw new AnalysisEngineProcessException(e);
}
ArrayFS matchtingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());
int matchtingTextArrayIndex = 0;
for (AnnotationFS matchingTextAnnotation: matchingTextAnnotations) {
matchtingTextArray.set(matchtingTextArrayIndex++,
matchingTextAnnotation);
}
searchResult.setFeatureValue(searchResultMatchingTextFeature,
matchtingTextArray);
}
}
}
}
}
}
}