enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.nlp.utils;

 import static java.util.Collections.singleton;

 import java.io.IOException;
 import java.util.Dictionary;
 import java.util.Map;
 import java.util.Map.Entry;

 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
 import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Utility class for {@link EnhancementEngine} implementations that
  * do use the {@link AnalysedText} content part
  * @author Rupert Westenthaler
  *
  */
 public final class NlpEngineHelper {

     private static final Logger log = LoggerFactory.getLogger(NlpEngineHelper.class);

     /** Static utility class - not intended to be instantiated. */
     private NlpEngineHelper(){}


     /**
      * Getter for the {@link AnalysedText} content part of a {@link ContentItem}.
      * A {@link RuntimeException} thrown while looking up the content part is
      * logged (level WARN) and then handled like a missing content part.
      * @param engine the EnhancementEngine calling this method (used for logging)
      * @param ci the ContentItem
      * @param exception <code>false</code> if used in
      * {@link EnhancementEngine#canEnhance(ContentItem)} and <code>true</code>
      * when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
      * @return the AnalysedText or <code>null</code> if not found and
      * exception is <code>false</code>.
      * @throws IllegalStateException if exception is <code>true</code> and the
      * {@link AnalysedText} could not be retrieved from the parsed {@link ContentItem}.
      */
     public static AnalysedText getAnalysedText(EnhancementEngine engine, ContentItem ci, boolean exception) {
         AnalysedText at;
         try {
             at = AnalysedTextUtils.getAnalysedText(ci);
         }catch (RuntimeException e) {
             //NOTE: the leading space of " because" was missing in the past,
             //      causing the ContentItem and the text to be run together
             log.warn("Unable to retrieve AnalysedText for ContentItem "
                 + ci + " because of an "+e.getClass().getSimpleName()+" with message "
                 + e.getMessage(),e);
             at = null;
         }
         if(at != null){
             return at;
         }
         if(exception){
             throw new IllegalStateException("Unable to retrieve AnalysedText from ContentItem "
                     + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
                     + "used EnhancementJobManager!");
         }
         //lenient mode: explain why the engine can not process the ContentItem
         log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
                 + "ContentItem {} because the AnalysedText ContentPart is "
                 + "missing. Users might want to add an EnhancementEngine that "
                 + "creates the AnalysedText ContentPart such as the "
                 + "POSTaggingEngine (o.a.stanbol.enhancer.engines.opennlp.pos)!",
                 new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
         return null;
     }

     /**
      * Getter for the language of the content as returned by
      * {@link EnhancementEngineHelper#getLanguage(ContentItem)}.
      * @param engine the EnhancementEngine calling this method (used for logging)
      * @param ci the ContentItem
      * @param exception <code>false</code> if used in
      * {@link EnhancementEngine#canEnhance(ContentItem)} and <code>true</code>
      * when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
      * @return the language or <code>null</code> if not found and
      * exception is <code>false</code>.
      * @throws IllegalStateException if exception is <code>true</code> and the
      * language could not be retrieved from the parsed {@link ContentItem}.
      */
     public static String getLanguage(EnhancementEngine engine, ContentItem ci, boolean exception) {
         String language = EnhancementEngineHelper.getLanguage(ci);
         if(language != null) {
             return language;
         }
         if(exception){
             throw new IllegalStateException("Unable to retrieve the detected language for ContentItem "
                     + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
                     + "used EnhancementJobManager!");
         }
         //lenient mode: explain why the engine can not process the ContentItem
         log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
                 + "ContentItem {} because the langauge of "
                 + "this ContentItem is unknown. Users might want to add a "
                 + "Language Identification EnhancementEngine to the current "
                 + "EnhancementChain!",
                 new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
         return null;
     }

     /**
      * Checks if a {@link ContentItem} should be processed based on the
      * language configuration of the calling engine.
      * @param engine the {@link EnhancementEngine} calling this method (its
      * name is used in the exception message)
      * @param languageConfiguration the language configuration
      * @param language the language
      * @param exception <code>false</code> if used in
      * {@link EnhancementEngine#canEnhance(ContentItem)} and <code>true</code>
      * when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
      * @return the state: <code>true</code> if the language is accepted by the
      * parsed configuration. <code>false</code> is only returned if exception
      * is <code>false</code>.
      * @throws IllegalStateException if exception is <code>true</code> and the
      * language is not configured as being processed.
      */
     public static boolean isLangaugeConfigured(EnhancementEngine engine, LanguageConfiguration languageConfiguration, String language, boolean exception){
         boolean state = languageConfiguration.isLanguage(language);
         if(!state && exception){
             throw new IllegalStateException("Language "+language+" is not included "
                     + "by the LanguageConfiguration of this engine (name "+ engine.getName()
                     + "). As this is also checked in canEnhancer this may indicate an Bug in the "
                     + "used EnhancementJobManager!");
         }
         return state;
     }

     /**
      * Retrieves - or if not present - creates the {@link AnalysedText} content
      * part for the parsed {@link ContentItem}. If no {@link Blob} with the
      * mime type '<code>text/plain</code>' is present this method
      * throws an {@link IllegalStateException} (this method internally uses
      * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
      * <code>true</code> as third parameter). Users of this method should call
      * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
      * <code>false</code> as third parameter in their
      * {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
      * <i>NOTE:</i> This method is intended for Engines that want to create an
      * empty {@link AnalysedText} content part. Engines that assume that this
      * content part is already present (e.g. if they consume already existing
      * annotations) should use the
      * {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
      * method instead.
      * @param engine the EnhancementEngine calling this method (used for logging)
      * @param analysedTextFactory the {@link AnalysedTextFactory} used to create
      * the {@link AnalysedText} instance (if not present).
      * @param ci the {@link ContentItem}
      * @return the AnalysedText
      * @throws EngineException if the {@link AnalysedTextFactory} fails with an
      * {@link IOException} while creating the {@link AnalysedText} for the
      * '<code>text/plain</code>' Blob
      * @throws IllegalStateException if no '<code>text/plain</code>' Blob is
      * present as content part of the parsed {@link ContentItem} or the parsed
      * {@link AnalysedTextFactory} is <code>null</code>. <i>NOTE</i> that
      * {@link IllegalStateException} are only thrown if the {@link AnalysedText}
      * ContentPart is not yet present in the parsed {@link ContentItem}
      */
     public static AnalysedText initAnalysedText(EnhancementEngine engine,
                                                 AnalysedTextFactory analysedTextFactory,
                                                 ContentItem ci) throws EngineException {
         AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
         if(at != null){
             log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
             return at;
         }
         if(analysedTextFactory == null){
             //NOTE: the space between "AnalysedText" and "ContentPart" was
             //      missing in the past
             throw new IllegalStateException("Unable to initialise AnalysedText "
                 + "ContentPart because the parsed AnalysedTextFactory is NULL");
         }
         //throws an IllegalStateException if no 'text/plain' Blob is present
         Entry<UriRef,Blob> textBlob = getPlainText(engine, ci, true);
         //we need to create the AnalysedText -> acquire the write lock
         ci.getLock().writeLock().lock();
         try {
             //try again to retrieve (maybe a concurrent thread has created
             //the content part in the meantime)
             at = AnalysedTextUtils.getAnalysedText(ci);
             if(at == null){
                 log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
                 at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
             }
         } catch (IOException e) {
             throw new EngineException("Unable to create AnalysetText instance for Blob "
                 + textBlob.getKey()+ " of ContentItem "+ci.getUri()+"!",e);
         } finally {
             ci.getLock().writeLock().unlock();
         }
         return at;
     }

     /**
      * Getter for the '<code>text/plain</code>' {@link Blob} of the content
      * as returned by {@link ContentItemHelper#getBlob(ContentItem, java.util.Set)}.
      * @param engine the EnhancementEngine calling this method (used for logging)
      * @param ci the ContentItem
      * @param exception <code>false</code> if used in
      * {@link EnhancementEngine#canEnhance(ContentItem)} and <code>true</code>
      * when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
      * @return the entry holding the '<code>text/plain</code>' {@link Blob}
      * (value) and its {@link UriRef} (key) or <code>null</code> if not
      * found and exception is <code>false</code>.
      * @throws IllegalStateException if exception is <code>true</code> and no
      * '<code>text/plain</code>' Blob could be retrieved from the parsed
      * {@link ContentItem}.
      */
     public static Entry<UriRef,Blob> getPlainText(EnhancementEngine engine, ContentItem ci, boolean exception) {
         Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(
             ci, singleton("text/plain"));
         if(textBlob != null) {
             return textBlob;
         }
         if(exception){
             throw new IllegalStateException("Unable to retrieve 'text/plain' ContentPart for ContentItem "
                     + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
                     + "used EnhancementJobManager!");
         }
         //lenient mode: explain why the engine can not process the ContentItem
         log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
                 + "ContentItem {} because no 'text/plain' ContentPart is "
                 + "present in this ContentItem. Users that need to enhance "
                 + "non-plain-text Content need to add an EnhancementEngine "
                 + "that supports the conversion of '{}' files to plain text "
                 + "to the current EnhancementChain!",
                 new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci,ci.getMimeType()});
         return null;
     }

     /**
      * Parses the {@link NlpProcessingRole} typically provided by the
      * {@link ServiceProperties#getServiceProperties()} provided by some
      * EnhancementEngines.<p>
      * This supports both {@link NlpProcessingRole} as well as String values
      * using the {@link NlpProcessingRole#name()}.
      * @param properties the properties (typically retrieved from the
      * {@link ServiceProperties#getServiceProperties()} method). A
      * <code>null</code> map is treated like an empty one.
      * @return the NLP processing role or <code>null</code> if not present OR
      * an error while parsing.
      */
     public static NlpProcessingRole getNlpProcessingRole(Map<String,Object> properties){
         if(properties == null){
             return null; //no properties -> no role present
         }
         Object value = properties.get(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE);
         if(value == null){
             return null;
         }
         if(value instanceof NlpProcessingRole){
             return (NlpProcessingRole)value;
         }
         //any other type: try to parse the role by its name
         try {
             return NlpProcessingRole.valueOf(value.toString());
         } catch (IllegalArgumentException e) {
             log.warn("Unknown NLP processing role {} -> return null",value);
             return null;
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.nlp.utils;

	import static java.util.Collections.singleton;

	import java.io.IOException;
	import java.util.Dictionary;
	import java.util.Map;
	import java.util.Map.Entry;

	import org.apache.clerezza.rdf.core.UriRef;
	import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
	import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
	import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
	import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
	import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
	import org.apache.stanbol.enhancer.servicesapi.Blob;
	import org.apache.stanbol.enhancer.servicesapi.ContentItem;
	import org.apache.stanbol.enhancer.servicesapi.EngineException;
	import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
	import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
	import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
	import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* Utility class for {@link EnhancementEngine} implementations that
	* do use the {@link AnalysedText} content part
	* @author Rupert Westenthaler
	*
	*/
	public final class NlpEngineHelper {

	    private static final Logger log = LoggerFactory.getLogger(NlpEngineHelper.class);

	    /** Static utility class - not intended to be instantiated. */
	    private NlpEngineHelper(){}


	    /**
	     * Getter for the {@link AnalysedText} content part of a {@link ContentItem}.
	     * A {@link RuntimeException} thrown while looking up the content part is
	     * logged (level WARN) and then handled like a missing content part.
	     * @param engine the EnhancementEngine calling this method (used for logging)
	     * @param ci the ContentItem
	     * @param exception <code>false</code> if used in
	     * {@link EnhancementEngine#canEnhance(ContentItem)} and <code>true</code>
	     * when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
	     * @return the AnalysedText or <code>null</code> if not found and
	     * exception is <code>false</code>.
	     * @throws IllegalStateException if exception is <code>true</code> and the
	     * {@link AnalysedText} could not be retrieved from the parsed {@link ContentItem}.
	     */
	    public static AnalysedText getAnalysedText(EnhancementEngine engine, ContentItem ci, boolean exception) {
	        AnalysedText at;
	        try {
	            at = AnalysedTextUtils.getAnalysedText(ci);
	        }catch (RuntimeException e) {
	            //NOTE: the leading space of " because" was missing in the past,
	            //      causing the ContentItem and the text to be run together
	            log.warn("Unable to retrieve AnalysedText for ContentItem "
	                + ci + " because of an "+e.getClass().getSimpleName()+" with message "
	                + e.getMessage(),e);
	            at = null;
	        }
	        if(at != null){
	            return at;
	        }
	        if(exception){
	            throw new IllegalStateException("Unable to retrieve AnalysedText from ContentItem "
	                    + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
	                    + "used EnhancementJobManager!");
	        }
	        //lenient mode: explain why the engine can not process the ContentItem
	        log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
	                + "ContentItem {} because the AnalysedText ContentPart is "
	                + "missing. Users might want to add an EnhancementEngine that "
	                + "creates the AnalysedText ContentPart such as the "
	                + "POSTaggingEngine (o.a.stanbol.enhancer.engines.opennlp.pos)!",
	                new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
	        return null;
	    }

	    /**
	     * Getter for the language of the content as returned by
	     * {@link EnhancementEngineHelper#getLanguage(ContentItem)}.
	     * @param engine the EnhancementEngine calling this method (used for logging)
	     * @param ci the ContentItem
	     * @param exception <code>false</code> if used in
	     * {@link EnhancementEngine#canEnhance(ContentItem)} and <code>true</code>
	     * when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
	     * @return the language or <code>null</code> if not found and
	     * exception is <code>false</code>.
	     * @throws IllegalStateException if exception is <code>true</code> and the
	     * language could not be retrieved from the parsed {@link ContentItem}.
	     */
	    public static String getLanguage(EnhancementEngine engine, ContentItem ci, boolean exception) {
	        String language = EnhancementEngineHelper.getLanguage(ci);
	        if(language != null) {
	            return language;
	        }
	        if(exception){
	            throw new IllegalStateException("Unable to retrieve the detected language for ContentItem "
	                    + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
	                    + "used EnhancementJobManager!");
	        }
	        //lenient mode: explain why the engine can not process the ContentItem
	        log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
	                + "ContentItem {} because the langauge of "
	                + "this ContentItem is unknown. Users might want to add a "
	                + "Language Identification EnhancementEngine to the current "
	                + "EnhancementChain!",
	                new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
	        return null;
	    }

	    /**
	     * Checks if a {@link ContentItem} should be processed based on the
	     * language configuration of the calling engine.
	     * @param engine the {@link EnhancementEngine} calling this method (its
	     * name is used in the exception message)
	     * @param languageConfiguration the language configuration
	     * @param language the language
	     * @param exception <code>false</code> if used in
	     * {@link EnhancementEngine#canEnhance(ContentItem)} and <code>true</code>
	     * when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
	     * @return the state: <code>true</code> if the language is accepted by the
	     * parsed configuration. <code>false</code> is only returned if exception
	     * is <code>false</code>.
	     * @throws IllegalStateException if exception is <code>true</code> and the
	     * language is not configured as being processed.
	     */
	    public static boolean isLangaugeConfigured(EnhancementEngine engine, LanguageConfiguration languageConfiguration, String language, boolean exception){
	        boolean state = languageConfiguration.isLanguage(language);
	        if(!state && exception){
	            throw new IllegalStateException("Language "+language+" is not included "
	                    + "by the LanguageConfiguration of this engine (name "+ engine.getName()
	                    + "). As this is also checked in canEnhancer this may indicate an Bug in the "
	                    + "used EnhancementJobManager!");
	        }
	        return state;
	    }

	    /**
	     * Retrieves - or if not present - creates the {@link AnalysedText} content
	     * part for the parsed {@link ContentItem}. If no {@link Blob} with the
	     * mime type '<code>text/plain</code>' is present this method
	     * throws an {@link IllegalStateException} (this method internally uses
	     * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
	     * <code>true</code> as third parameter). Users of this method should call
	     * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
	     * <code>false</code> as third parameter in their
	     * {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
	     * <i>NOTE:</i> This method is intended for Engines that want to create an
	     * empty {@link AnalysedText} content part. Engines that assume that this
	     * content part is already present (e.g. if they consume already existing
	     * annotations) should use the
	     * {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
	     * method instead.
	     * @param engine the EnhancementEngine calling this method (used for logging)
	     * @param analysedTextFactory the {@link AnalysedTextFactory} used to create
	     * the {@link AnalysedText} instance (if not present).
	     * @param ci the {@link ContentItem}
	     * @return the AnalysedText
	     * @throws EngineException if the {@link AnalysedTextFactory} fails with an
	     * {@link IOException} while creating the {@link AnalysedText} for the
	     * '<code>text/plain</code>' Blob
	     * @throws IllegalStateException if no '<code>text/plain</code>' Blob is
	     * present as content part of the parsed {@link ContentItem} or the parsed
	     * {@link AnalysedTextFactory} is <code>null</code>. <i>NOTE</i> that
	     * {@link IllegalStateException} are only thrown if the {@link AnalysedText}
	     * ContentPart is not yet present in the parsed {@link ContentItem}
	     */
	    public static AnalysedText initAnalysedText(EnhancementEngine engine,
	                                                AnalysedTextFactory analysedTextFactory,
	                                                ContentItem ci) throws EngineException {
	        AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
	        if(at != null){
	            log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
	            return at;
	        }
	        if(analysedTextFactory == null){
	            //NOTE: the space between "AnalysedText" and "ContentPart" was
	            //      missing in the past
	            throw new IllegalStateException("Unable to initialise AnalysedText "
	                + "ContentPart because the parsed AnalysedTextFactory is NULL");
	        }
	        //throws an IllegalStateException if no 'text/plain' Blob is present
	        Entry<UriRef,Blob> textBlob = getPlainText(engine, ci, true);
	        //we need to create the AnalysedText -> acquire the write lock
	        ci.getLock().writeLock().lock();
	        try {
	            //try again to retrieve (maybe a concurrent thread has created
	            //the content part in the meantime)
	            at = AnalysedTextUtils.getAnalysedText(ci);
	            if(at == null){
	                log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
	                at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
	            }
	        } catch (IOException e) {
	            throw new EngineException("Unable to create AnalysetText instance for Blob "
	                + textBlob.getKey()+ " of ContentItem "+ci.getUri()+"!",e);
	        } finally {
	            ci.getLock().writeLock().unlock();
	        }
	        return at;
	    }

	    /**
	     * Getter for the '<code>text/plain</code>' {@link Blob} of the content
	     * as returned by {@link ContentItemHelper#getBlob(ContentItem, java.util.Set)}.
	     * @param engine the EnhancementEngine calling this method (used for logging)
	     * @param ci the ContentItem
	     * @param exception <code>false</code> if used in
	     * {@link EnhancementEngine#canEnhance(ContentItem)} and <code>true</code>
	     * when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
	     * @return the entry holding the '<code>text/plain</code>' {@link Blob}
	     * (value) and its {@link UriRef} (key) or <code>null</code> if not
	     * found and exception is <code>false</code>.
	     * @throws IllegalStateException if exception is <code>true</code> and no
	     * '<code>text/plain</code>' Blob could be retrieved from the parsed
	     * {@link ContentItem}.
	     */
	    public static Entry<UriRef,Blob> getPlainText(EnhancementEngine engine, ContentItem ci, boolean exception) {
	        Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(
	            ci, singleton("text/plain"));
	        if(textBlob != null) {
	            return textBlob;
	        }
	        if(exception){
	            throw new IllegalStateException("Unable to retrieve 'text/plain' ContentPart for ContentItem "
	                    + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
	                    + "used EnhancementJobManager!");
	        }
	        //lenient mode: explain why the engine can not process the ContentItem
	        log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
	                + "ContentItem {} because no 'text/plain' ContentPart is "
	                + "present in this ContentItem. Users that need to enhance "
	                + "non-plain-text Content need to add an EnhancementEngine "
	                + "that supports the conversion of '{}' files to plain text "
	                + "to the current EnhancementChain!",
	                new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci,ci.getMimeType()});
	        return null;
	    }

	    /**
	     * Parses the {@link NlpProcessingRole} typically provided by the
	     * {@link ServiceProperties#getServiceProperties()} provided by some
	     * EnhancementEngines.<p>
	     * This supports both {@link NlpProcessingRole} as well as String values
	     * using the {@link NlpProcessingRole#name()}.
	     * @param properties the properties (typically retrieved from the
	     * {@link ServiceProperties#getServiceProperties()} method). A
	     * <code>null</code> map is treated like an empty one.
	     * @return the NLP processing role or <code>null</code> if not present OR
	     * an error while parsing.
	     */
	    public static NlpProcessingRole getNlpProcessingRole(Map<String,Object> properties){
	        if(properties == null){
	            return null; //no properties -> no role present
	        }
	        Object value = properties.get(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE);
	        if(value == null){
	            return null;
	        }
	        if(value instanceof NlpProcessingRole){
	            return (NlpProcessingRole)value;
	        }
	        //any other type: try to parse the role by its name
	        try {
	            return NlpProcessingRole.valueOf(value.toString());
	        } catch (IllegalArgumentException e) {
	            log.warn("Unknown NLP processing role {} -> return null",value);
	            return null;
	        }
	    }
	}