| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.entitycomention.impl; |
| |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE; |
| import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION; |
| |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Set; |
| import java.util.SortedMap; |
| import java.util.TreeMap; |
| |
| import org.apache.clerezza.rdf.core.LiteralFactory; |
| import org.apache.clerezza.rdf.core.MGraph; |
| import org.apache.clerezza.rdf.core.NonLiteral; |
| import org.apache.clerezza.rdf.core.Triple; |
| import org.apache.clerezza.rdf.core.TripleCollection; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants; |
| import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer; |
| import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkingStateAware; |
| import org.apache.stanbol.enhancer.nlp.model.Section; |
| import org.apache.stanbol.enhancer.nlp.model.Token; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| public class ContentItemMentionBuilder extends InMemoryEntityIndex implements LinkingStateAware{ |
| |
| private static final Logger log = LoggerFactory.getLogger(ContentItemMentionBuilder.class); |
| private static final LiteralFactory lf = LiteralFactory.getInstance(); |
| |
| /** |
| * The last index notified via {@link #startToken(Token)} |
| */ |
| private Integer lastIndex = 0; |
| |
| private SortedMap<Integer,Collection<EntityMention>> mentionIndex = new TreeMap<Integer,Collection<EntityMention>>(); |
| |
| public ContentItemMentionBuilder(LabelTokenizer labelTokenizer, String...languages){ |
| super(labelTokenizer,CoMentionConstants.CO_MENTION_LABEL_FIELD, languages); |
| } |
| |
| public void registerTextAnnotation(UriRef textAnnotation, TripleCollection metadata){ |
| String selectedText = EnhancementEngineHelper.getString(metadata, textAnnotation, ENHANCER_SELECTED_TEXT); |
| if(selectedText != null){ |
| //NOTE: Typically it is not possible to find co-mentions for Entities with a |
| // single Token, so can ignore those. |
| // The only exception would be to use proper-nouns for initial linking and |
| // Nouns for the co-mention resolution. In such cases this might result |
| // in additional extractions. |
| String[] tokens = tokenizer.tokenize(selectedText, language); |
| if(tokens != null && tokens.length > 1){ //TODO make configurable |
| Double confidence = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_CONFIDENCE,Double.class,lf); |
| if(confidence == null || confidence > 0.85){ //TODO make configurable |
| Integer start = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_START,Integer.class,lf); |
| Integer end = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_END,Integer.class,lf); |
| registerMention(new EntityMention(textAnnotation,metadata, ENHANCER_SELECTED_TEXT, DC_TYPE, |
| start != null && end != null ? new Integer[]{start,end} : null)); |
| } // else confidence to low |
| } else if(tokens == null){ |
| log.warn("Unable to tokenize \"{}\"@{} via tokenizer {} (class: {})!", new Object []{ |
| selectedText,language,tokenizer, tokenizer.getClass().getName()}); |
| } //else ignore Tokens with a single token |
| } // else no selected text |
| } |
| |
| private void registerMention(EntityMention entityMention){ |
| log.debug(" > register {} ",entityMention); |
| if(entityMention.getStart() == null || entityMention.getStart() < 0){ |
| addEntity(entityMention); |
| } else { |
| Collection<EntityMention> mentions = mentionIndex.get(entityMention.getEnd()); |
| if(mentions == null){ |
| mentions = new ArrayList<EntityMention>(); |
| mentionIndex.put(entityMention.getEnd(), mentions); |
| } |
| mentions.add(entityMention); |
| } |
| } |
| |
| /** |
| * Everytime the entityLinker starts to process a token we need to check |
| * if we need to add additional contextual information from the {@link ContentItem} |
| * to the {@link InMemoryEntityIndex} |
| */ |
| @Override |
| public void startToken(Token token) { |
| log.debug(" > start token: {}",token); |
| final Integer actIndex = token.getStart(); |
| if(actIndex > lastIndex){ |
| for(Collection<EntityMention> mentions : mentionIndex.subMap(lastIndex, actIndex).values()){ |
| for(EntityMention mention : mentions){ |
| addEntity(mention); |
| } |
| } |
| lastIndex = actIndex; |
| } else if(lastIndex > actIndex){ |
| log.warn("Token {} has earlier start index as the last one {}!", token, lastIndex); |
| } // else the same index ... ignore |
| } |
| |
| @Override |
| public void startSection(Section sentence) {/* not used */} |
| @Override |
| public void endSection(Section sentence) {/* not used */} |
| @Override |
| public void endToken(Token token) {/* not used */} |
| |
| |
| } |