// blob: 91def8e4d152d6c148f7d66a5e72007454dc6dec [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.entitycomention.impl;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants;
import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkingStateAware;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link InMemoryEntityIndex} that is filled with {@link EntityMention}s created
 * from text annotations.
 * <p>
 * Mentions without a usable start offset are added to the index as soon as they
 * are registered. Mentions with a start offset are kept back, grouped by their
 * end offset, and only added once {@link #startToken(Token)} is notified about
 * a token that starts after that end offset.
 */
public class ContentItemMentionBuilder extends InMemoryEntityIndex implements LinkingStateAware{

    private static final Logger log = LoggerFactory.getLogger(ContentItemMentionBuilder.class);
    private static final LiteralFactory lf = LiteralFactory.getInstance();

    /**
     * The last index notified via {@link #startToken(Token)}
     */
    private Integer lastIndex = 0;

    /**
     * Mentions that are not yet part of the index, keyed by their end offset.
     */
    private SortedMap<Integer,Collection<EntityMention>> mentionIndex = new TreeMap<Integer,Collection<EntityMention>>();

    /**
     * Creates a builder that indexes mentions under the
     * {@link CoMentionConstants#CO_MENTION_LABEL_FIELD} field.
     * @param labelTokenizer passed through to the super constructor
     * @param languages passed through to the super constructor
     */
    public ContentItemMentionBuilder(LabelTokenizer labelTokenizer, String...languages){
        super(labelTokenizer,CoMentionConstants.CO_MENTION_LABEL_FIELD, languages);
    }

    /**
     * Registers the parsed text annotation as a mention, provided that it has a
     * selected text consisting of more than one token and a confidence that is
     * either missing or above <code>0.85</code>.
     * @param textAnnotation the text annotation to register
     * @param metadata the triples the values of the text annotation are read from
     */
    public void registerTextAnnotation(UriRef textAnnotation, TripleCollection metadata){
        final String mentionText = EnhancementEngineHelper.getString(
            metadata, textAnnotation, ENHANCER_SELECTED_TEXT);
        if(mentionText == null){
            return; // no selected text
        }
        //NOTE: Typically it is not possible to find co-mentions for Entities with
        //      a single Token, so those can be ignored.
        //      The only exception would be to use proper-nouns for the initial
        //      linking and Nouns for the co-mention resolution. In such cases
        //      this might result in additional extractions.
        final String[] labelTokens = tokenizer.tokenize(mentionText, language);
        if(labelTokens == null){
            log.warn("Unable to tokenize \"{}\"@{} via tokenizer {} (class: {})!", new Object []{
                    mentionText, language, tokenizer, tokenizer.getClass().getName()});
            return;
        }
        if(labelTokens.length <= 1){ //TODO make configurable
            return; // ignore mentions with a single (or no) token
        }
        final Double score = EnhancementEngineHelper.get(
            metadata, textAnnotation, ENHANCER_CONFIDENCE, Double.class, lf);
        // written as a negated '>' so that a NaN confidence is rejected as well
        if(score != null && !(score > 0.85)){ //TODO make configurable
            return; // confidence too low
        }
        final Integer from = EnhancementEngineHelper.get(
            metadata, textAnnotation, ENHANCER_START, Integer.class, lf);
        final Integer to = EnhancementEngineHelper.get(
            metadata, textAnnotation, ENHANCER_END, Integer.class, lf);
        // a span is only parsed if both offsets are available
        final Integer[] span;
        if(from == null || to == null){
            span = null;
        } else {
            span = new Integer[]{from, to};
        }
        registerMention(new EntityMention(textAnnotation, metadata,
            ENHANCER_SELECTED_TEXT, DC_TYPE, span));
    }

    /**
     * Adds the mention to the index right away if it has no (or a negative)
     * start offset. Otherwise the mention is stored under its end offset until
     * {@link #startToken(Token)} adds it.
     * @param entityMention the mention to register
     */
    private void registerMention(EntityMention entityMention){
        log.debug(" > register {} ",entityMention);
        final Integer start = entityMention.getStart();
        if(start == null || start < 0){
            addEntity(entityMention);
            return;
        }
        final Integer end = entityMention.getEnd();
        Collection<EntityMention> bucket = mentionIndex.get(end);
        if(bucket == null){
            bucket = new ArrayList<EntityMention>();
            mentionIndex.put(end, bucket);
        }
        bucket.add(entityMention);
    }

    /**
     * Whenever the entity linker starts to process a token this checks if
     * additional contextual information from the {@link ContentItem} needs to
     * be added to the {@link InMemoryEntityIndex}: all stored mentions with an
     * end offset from the last notified index (inclusive) up to the start of
     * the parsed token (exclusive) are added.
     */
    @Override
    public void startToken(Token token) {
        log.debug(" > start token: {}",token);
        final Integer tokenStart = token.getStart();
        if(lastIndex > tokenStart){
            log.warn("Token {} has earlier start index as the last one {}!", token, lastIndex);
            return;
        }
        if(tokenStart > lastIndex){
            final SortedMap<Integer,Collection<EntityMention>> passed =
                    mentionIndex.subMap(lastIndex, tokenStart);
            for(Collection<EntityMention> bucket : passed.values()){
                for(EntityMention mention : bucket){
                    addEntity(mention);
                }
            }
            lastIndex = tokenStart;
        } // else the same index ... nothing to do
    }

    @Override
    public void startSection(Section sentence) {
        /* not used */
    }

    @Override
    public void endSection(Section sentence) {
        /* not used */
    }

    @Override
    public void endToken(Token token) {
        /* not used */
    }
}