enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ContentItemMentionBuilder.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.engines.entitycomention.impl;

 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;

 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;

 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants;
 import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkingStateAware;
 import org.apache.stanbol.enhancer.nlp.model.Section;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * {@link InMemoryEntityIndex} that is filled with {@link EntityMention}s built
  * from text annotations. Mentions that carry a non-negative start offset are
  * parked by their end offset and only added to the index after a later token
  * starts behind them (see {@link #startToken(Token)}); all other mentions are
  * added to the index immediately.
  */
 public class ContentItemMentionBuilder extends InMemoryEntityIndex implements LinkingStateAware{

     private static final Logger log = LoggerFactory.getLogger(ContentItemMentionBuilder.class);
     private static final LiteralFactory lf = LiteralFactory.getInstance();

     /**
      * Start offset of the last token passed to {@link #startToken(Token)} that
      * moved the position forward (initially <code>0</code>).
      */
     private Integer lastIndex = 0;

     /**
      * Parked mentions, grouped by the end offset of the mention.
      */
     private SortedMap<Integer,Collection<EntityMention>> mentionIndex = new TreeMap<Integer,Collection<EntityMention>>();

     /**
      * @param labelTokenizer the tokenizer handed to the super class; it is also
      * used to tokenize the selected text of registered text annotations
      * @param languages the languages handed to the super class
      */
     public ContentItemMentionBuilder(LabelTokenizer labelTokenizer, String...languages){
         super(labelTokenizer,CoMentionConstants.CO_MENTION_LABEL_FIELD, languages);
     }

     /**
      * Registers the parsed text annotation as {@link EntityMention} if its
      * selected text tokenizes to more than one token and its confidence is
      * either missing or greater than <code>0.85</code>.
      * @param textAnnotation the resource of the text annotation
      * @param metadata the triples holding the values of the text annotation
      */
     public void registerTextAnnotation(UriRef textAnnotation, TripleCollection metadata){
         final String text = EnhancementEngineHelper.getString(metadata, textAnnotation, ENHANCER_SELECTED_TEXT);
         if(text == null){
             return; //nothing selected -> nothing to register
         }
         //NOTE: A mention consisting of a single token can typically not have
         //      co-mentions, therefore those are skipped. This would only be
         //      different if proper-nouns are used for the initial linking and
         //      nouns for the co-mention resolution.
         final String[] labelTokens = tokenizer.tokenize(text, language);
         if(labelTokens == null){
             log.warn("Unable to tokenize \"{}\"@{} via tokenizer {} (class: {})!", new Object []{
                 text,language,tokenizer, tokenizer.getClass().getName()});
             return;
         }
         if(labelTokens.length <= 1){ //TODO make configurable
             return; //single (or no) token
         }
         final Double confidence = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_CONFIDENCE,Double.class,lf);
         final boolean accepted = confidence == null || confidence > 0.85; //TODO make configurable
         if(!accepted){
             return; //confidence too low
         }
         final Integer start = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_START,Integer.class,lf);
         final Integer end = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_END,Integer.class,lf);
         //the span is only parsed if both offsets are known
         final Integer[] span;
         if(start == null || end == null){
             span = null;
         } else {
             span = new Integer[]{start,end};
         }
         registerMention(new EntityMention(textAnnotation,metadata, ENHANCER_SELECTED_TEXT, DC_TYPE, span));
     }

     /**
      * Adds the mention directly to the index if it has no (or a negative)
      * start offset. Otherwise the mention is parked under its end offset.
      */
     private void registerMention(EntityMention entityMention){
         log.debug(" > register {} ",entityMention);
         final Integer start = entityMention.getStart();
         if(start != null && start >= 0){
             final Integer end = entityMention.getEnd();
             Collection<EntityMention> bucket = mentionIndex.get(end);
             if(bucket == null){ //first mention ending at this offset
                 bucket = new ArrayList<EntityMention>();
                 mentionIndex.put(end, bucket);
             }
             bucket.add(entityMention);
         } else {
             addEntity(entityMention);
         }
     }

     /**
      * Called when the entity linker starts to process a token. If the token
      * starts behind the last notified position all parked mentions with an end
      * offset in the range from the last position (inclusive) to the start of
      * the token (exclusive) are added to the index.
      */
     @Override
     public void startToken(Token token) {
         log.debug(" > start token: {}",token);
         final Integer tokenStart = token.getStart();
         if(tokenStart < lastIndex){
             log.warn("Token {} has earlier start index as the last one {}!", token, lastIndex);
             return;
         }
         if(tokenStart > lastIndex){
             final SortedMap<Integer,Collection<EntityMention>> passed = mentionIndex.subMap(lastIndex, tokenStart);
             for(Collection<EntityMention> bucket : passed.values()){
                 for(EntityMention passedMention : bucket){
                     addEntity(passedMention);
                 }
             }
             lastIndex = tokenStart;
         } // else same position as before ... nothing to do
     }

     @Override
     public void endToken(Token token) {/* not used */}
     @Override
     public void startSection(Section sentence) {/* not used */}
     @Override
     public void endSection(Section sentence) {/* not used */}


 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.entitycomention.impl;

	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;

	import java.util.ArrayList;
	import java.util.Collection;
	import java.util.HashSet;
	import java.util.Iterator;
	import java.util.Set;
	import java.util.SortedMap;
	import java.util.TreeMap;

	import org.apache.clerezza.rdf.core.LiteralFactory;
	import org.apache.clerezza.rdf.core.MGraph;
	import org.apache.clerezza.rdf.core.NonLiteral;
	import org.apache.clerezza.rdf.core.Triple;
	import org.apache.clerezza.rdf.core.TripleCollection;
	import org.apache.clerezza.rdf.core.UriRef;
	import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants;
	import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
	import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkingStateAware;
	import org.apache.stanbol.enhancer.nlp.model.Section;
	import org.apache.stanbol.enhancer.nlp.model.Token;
	import org.apache.stanbol.enhancer.servicesapi.ContentItem;
	import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	public class ContentItemMentionBuilder extends InMemoryEntityIndex implements LinkingStateAware{

	private static final Logger log = LoggerFactory.getLogger(ContentItemMentionBuilder.class);
	private static final LiteralFactory lf = LiteralFactory.getInstance();

	/**
	* The last index notified via {@link #startToken(Token)}
	*/
	private Integer lastIndex = 0;

	private SortedMap<Integer,Collection<EntityMention>> mentionIndex = new TreeMap<Integer,Collection<EntityMention>>();

	public ContentItemMentionBuilder(LabelTokenizer labelTokenizer, String...languages){
	super(labelTokenizer,CoMentionConstants.CO_MENTION_LABEL_FIELD, languages);
	}

	public void registerTextAnnotation(UriRef textAnnotation, TripleCollection metadata){
	String selectedText = EnhancementEngineHelper.getString(metadata, textAnnotation, ENHANCER_SELECTED_TEXT);
	if(selectedText != null){
	//NOTE: Typically it is not possible to find co-mentions for Entities with a
	// single Token, so can ignore those.
	// The only exception would be to use proper-nouns for initial linking and
	// Nouns for the co-mention resolution. In such cases this might result
	// in additional extractions.
	String[] tokens = tokenizer.tokenize(selectedText, language);
	if(tokens != null && tokens.length > 1){ //TODO make configurable
	Double confidence = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_CONFIDENCE,Double.class,lf);
	if(confidence == null \|\| confidence > 0.85){ //TODO make configurable
	Integer start = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_START,Integer.class,lf);
	Integer end = EnhancementEngineHelper.get(metadata,textAnnotation,ENHANCER_END,Integer.class,lf);
	registerMention(new EntityMention(textAnnotation,metadata, ENHANCER_SELECTED_TEXT, DC_TYPE,
	start != null && end != null ? new Integer[]{start,end} : null));
	} // else confidence to low
	} else if(tokens == null){
	log.warn("Unable to tokenize \"{}\"@{} via tokenizer {} (class: {})!", new Object []{
	selectedText,language,tokenizer, tokenizer.getClass().getName()});
	} //else ignore Tokens with a single token
	} // else no selected text
	}

	private void registerMention(EntityMention entityMention){
	log.debug(" > register {} ",entityMention);
	if(entityMention.getStart() == null \|\| entityMention.getStart() < 0){
	addEntity(entityMention);
	} else {
	Collection<EntityMention> mentions = mentionIndex.get(entityMention.getEnd());
	if(mentions == null){
	mentions = new ArrayList<EntityMention>();
	mentionIndex.put(entityMention.getEnd(), mentions);
	}
	mentions.add(entityMention);
	}
	}

	/**
	* Everytime the entityLinker starts to process a token we need to check
	* if we need to add additional contextual information from the {@link ContentItem}
	* to the {@link InMemoryEntityIndex}
	*/
	@Override
	public void startToken(Token token) {
	log.debug(" > start token: {}",token);
	final Integer actIndex = token.getStart();
	if(actIndex > lastIndex){
	for(Collection<EntityMention> mentions : mentionIndex.subMap(lastIndex, actIndex).values()){
	for(EntityMention mention : mentions){
	addEntity(mention);
	}
	}
	lastIndex = actIndex;
	} else if(lastIndex > actIndex){
	log.warn("Token {} has earlier start index as the last one {}!", token, lastIndex);
	} // else the same index ... ignore
	}

	@Override
	public void startSection(Section sentence) {/* not used */}
	@Override
	public void endSection(Section sentence) {/* not used */}
	@Override
	public void endToken(Token token) {/* not used */}


	}