blob: a6d2d419a1eadc41e22f4103998a9933bee5793d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.lucenefstlinking;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.NER_ANNOTATION;
import java.io.IOException;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NavigableMap;
import java.util.Set;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.FilterIterator;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.opensextant.solrtexttagger.TagClusterReducer;
import org.opensextant.solrtexttagger.TagLL;
import org.opensextant.solrtexttagger.TaggingAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Class that ensures that only Tokens within a {@link Chunk} with a
* {@link NerTag} are processed.<p>
* This is ensured in two places:<ol>
* <li> Classifies Tokens in the Solr {@link TokenStream} with the {@link TaggingAttribute}
* based on {@link NerTag}s present in the {@link AnalysedText}.<p>
* <li> Implements {@link TagClusterReducer} to ensure that all {@link TagLL tags}
* that do not cover the whole Named Entity are removed from the Cluster.
* </ol>
* <b> Implementation Details</b><p>
* The {@link TokenStream} implementation of this class sets
* <code>{@link TaggingAttribute#isTaggable()} == true</code> for tokens that overlap
* with a {@link Chunk} having an {@link NerTag}.
* <p>
* The {@link TagClusterReducer} implementation keeps track of Chunks with
* {@link NerTag} while iterating over the {@link TokenStream} and adds them to
* the end of a List. When {@link TagClusterReducer#reduce(TagLL[])} is called
* tags of the cluster are checked if they do cover Chunks with a {@link NerTag}.
* If they do not they are removed from the cluster.
* <p>
* This implementation was derived from the {@link LinkableTokenFilter}
*
* @author Rupert Westenthaler
*
*/
public final class NamedEntityTokenFilter extends TokenFilter implements TagClusterReducer{
private final Logger log = LoggerFactory.getLogger(NamedEntityTokenFilter.class);
/**
* The NLP processing results
*/
private AnalysedText at;
/**
* The language of the text
*/
/**
* Iterator over all {@link Chunk}s in the {@link AnalysedText} that do
* have an {@link NerTag}
*/
private Iterator<Chunk> neChunks;
protected final CharTermAttribute termAtt;
protected final OffsetAttribute offset;
protected final TaggingAttribute taggable;
private int lookupCount = 0;
private int incrementCount = 0;
/**
* List with {@link Chunk}s having {@link NerTag}s. This is used by
* the {@link #reduce(TagLL[])} method to check if {@link TagLL tags}
* do cover Named Entities detected in the text.
*/
private List<Chunk> nePhrases;
private final NavigableMap<int[],Set<String>> nePhrasesTypes;
private Chunk neChunk;
protected final boolean wildcardType;
protected final Set<String> neTypes;
/**
* A Token Filter for Named Entities of the configured types. Also collects
* '<code>span -&gt type</code>' mappings for Named Entities.
* @param input the input token stream for the parsed text
* @param at the {@link AnalysedText} containing {@link NerTag} values
* @param lang the language of the text
* @param neTypes the string {@link NerTag#getType()} and {@link NerTag#getTag()}
* values of enabled Named Entities. If <code>null</code> or containing the
* <code>null</code> element all types will be accepted.
* @param nePhrasesTypes The {@link NavigableMap} used to store the spans of
* named entities as key and the set o their {@link NerTag#getTag()} and
* {@link NerTag#getType()} as values. Those information are collected while
* iterating over the text (by the {@link NamedEntityPredicate}) and are
* used later for filtering {@link Match}es based on the type of the Entities.
* Typically the {@link TaggingSession#entityMentionTypes} is parsed as this
* parameter.
*/
protected NamedEntityTokenFilter(TokenStream input, AnalysedText at, String lang,
Set<String> neTypes, NavigableMap<int[],Set<String>> nePhrasesTypes) {
super(input);
//STANBOL-1177: add attributes in doPrivileged to avoid
//AccessControlException: access denied ("java.lang.RuntimePermission" "getClassLoader")
termAtt = AccessController.doPrivileged(new PrivilegedAction<CharTermAttribute>() {
@Override public CharTermAttribute run() {
return addAttribute(CharTermAttribute.class);
}});
offset = AccessController.doPrivileged(new PrivilegedAction<OffsetAttribute>() {
@Override public OffsetAttribute run() {
return addAttribute(OffsetAttribute.class);
}});
taggable = AccessController.doPrivileged(new PrivilegedAction<TaggingAttribute>() {
@Override public TaggingAttribute run() {
return addAttribute(TaggingAttribute.class);
}});
this.at = at;
this.wildcardType = neTypes == null || neTypes.contains(null);
this.neTypes = neTypes;
this.nePhrasesTypes = nePhrasesTypes;
}
@SuppressWarnings("unchecked")
@Override
public void reset() throws IOException {
super.reset();
nePhrases = new LinkedList<Chunk>();
neChunks = new FilterIterator(at.getChunks(), new NamedEntityPredicate());
}
@Override
public boolean incrementToken() throws IOException {
if(input.incrementToken()){
incrementCount++;
if(log.isTraceEnabled()){
log.trace("> solr:[{},{}] {}",new Object[]{
offset.startOffset(), offset.endOffset(), termAtt});
}
while((neChunk == null || neChunk.getEnd() < offset.startOffset()) && neChunks.hasNext()){
neChunk = neChunks.next();
nePhrases.add(neChunk);
}
if(neChunk == null){
taggable.setTaggable(false);
incrementCount++;
log.debug("lookup percentage: {}",lookupCount*100/(float)incrementCount);
return false;
} else if(offset.endOffset() > neChunk.getStart()
|| offset.startOffset() < neChunk.getEnd()){
//set tagable to true if the tokens overlapps with the current chunk
taggable.setTaggable(true);
if(log.isTraceEnabled()){
log.trace("lookup: token [{},{}]: {} | named Entity [{},{}]:{}",
new Object[]{ offset.startOffset(), offset.endOffset(),
termAtt, neChunk.getStart(), neChunk.getEnd(),
neChunk.getSpan()});
}
lookupCount++;
} else {
taggable.setTaggable(false);
}
incrementCount++;
return true;
} else { //no more tokens in the parent token stream
return false;
}
}
@Override
public void reduce(TagLL[] head) {
//(1) reduce Tags based on named entity phrases.
for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
int start = tag.getStartOffset();
int end = tag.getEndOffset();
Chunk nePhrase = nePhrases.isEmpty() ? null : nePhrases.get(0);
while(nePhrase != null && nePhrase.getEnd() <= start){
nePhrases.remove(0);
nePhrase = nePhrases.isEmpty() ? null : nePhrases.get(0);
}
if(nePhrase == null || !(start <= nePhrase.getStart() && end >= nePhrase.getEnd())){
//does not cover any named entity phrase
tag.removeLL(); //remove the tag from the cluster
if(log.isTraceEnabled()){
log.trace(" > reduce tag {} - does not cover {}", tag, nePhrase);
}
} else if(log.isTraceEnabled()) {//the current Tag coveres a named entity phrase
log.trace(" > keep tag {} for {}", tag, nePhrase);
}
}
}
/**
* {@link Predicate} used to select Named Entities based on matching
* {@link NerTag#getTag()} and {@link NerTag#getType()} values against the
* {@link NamedEntityTokenFilter#neTypes} configuration. As a side effect
* this also collects the {@link NamedEntityTokenFilter#nePhrasesTypes}
* information. This avoids a 2nd pass over the {@link AnalysedText} to
* collect those information
* @author Rupert Westenthaler
*
*/
final class NamedEntityPredicate implements Predicate {
@Override
public boolean evaluate(Object o) {
if(o instanceof Chunk){
Chunk chunk = (Chunk)o;
Value<NerTag> nerValue = chunk.getAnnotation(NER_ANNOTATION);
if(nerValue != null){
NerTag nerTag = nerValue.value();
String nerType = nerTag.getType() != null ?
nerTag.getType().getUnicodeString() : null;
if( wildcardType || neTypes.contains(nerTag.getTag())
|| (nerType != null && neTypes.contains(nerType))){
int[] span = new int[]{chunk.getStart(), chunk.getEnd()};
Set<String> types = nePhrasesTypes.get(span);
if(types == null){
types = new HashSet<String>(4);
nePhrasesTypes.put(span, types);
}
types.add(nerType);
types.add(nerTag.getTag());
return true;
}
}
}
return false;
}
}
}