blob: 62ad2937361236af8bc7305c5c09eefe0ae5c2ca [file] [log] [blame]
/*
* Copyright (c) 2012 Sebastian Schaffert
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.sentiment.services;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.SENTIMENT_ANNOTATION;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.ReferenceStrategy;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A Stanbol engine that associates sentiment values with the tokens created by the POS tagging engine.
* Sentiment values are added to the POSContentPart of the content item and can be further analysed by other
* engines, e.g. to compute sentiment values for the whole content item or in relation to certain nouns.
* <p/>
* The configuration allows specifying whether to analyse all words or only adjectives and nouns (a typical case).
* <p/>
* Currently, sentiment analysis is available for English and for German language. It uses the following word lists:
* <ul>
* <li>English: SentiWordNet (http://wordnet.princeton.edu/), license allows commercial use</li>
* <li>German: SentiWS (http://wortschatz.informatik.uni-leipzig.de/download/), license does NOT allow commercial use</li>
* </ul>
* <p/>
* Author: Sebastian Schaffert
*/
@Component(immediate = true, metatype = true,
configurationFactory = true, //allow multiple instances
policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
@Service
@Properties(value={
@Property(name= EnhancementEngine.PROPERTY_NAME,value="sentiment-wordclassifier"),
@Property(name=SentimentEngine.CONFIG_LANGUAGES,value={SentimentEngine.DEFAULT_LANGUAGE_CONFIG}),
@Property(name=SentimentEngine.CONFIG_ADJECTIVES,
boolValue=SentimentEngine.DEFAULT_PROCESS_ADJECTIVES_ONLY),
@Property(name=SentimentEngine.CONFIG_MIN_POS_CONFIDENCE,
doubleValue = SentimentEngine.DEFAULT_MIN_POS_CONFIDNECE),
@Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
})
public class SentimentEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {
/**
* Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
* are the languages given as default value.
*/
public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.sentiment.languages";
/**
* When set to true, only adjectives and nouns will be considered in sentiment analysis.
*/
public static final String CONFIG_ADJECTIVES = "org.apache.stanbol.enhancer.sentiment.adjectives";
/**
* POS tags that are not selected by {@link SentimentClassifier#isAdjective(PosTag)}
* or {@link SentimentClassifier#isNoun(PosTag)} are ignored if their confidence
* is &gt= the configured values. If there are multiple POS tag suggestions,
* that Words that do have a suitable TAG are still considered if the
* confidence of the fitting tag is &gt;= {min-pos-confidence}/2
*/
public static final String CONFIG_MIN_POS_CONFIDENCE = "org.apache.stanbol.enhancer.sentiment.min-pos-confidence";
boolean debugSentiments;
public static final String DEFAULT_LANGUAGE_CONFIG = "*";
private LanguageConfiguration langaugeConfig =
new LanguageConfiguration(CONFIG_LANGUAGES, new String[]{DEFAULT_LANGUAGE_CONFIG});
/**
* The minimum confidence of POS tags so that a token is NOT processed if
* the {@link LexicalCategory} is NOT {@link LexicalCategory#Adjective} (or
* {@link LexicalCategory#Noun Noun} if {@link #CONFIG_ADJECTIVES} is
* deactivated) - default: 0.8<p>
*/
public static final double DEFAULT_MIN_POS_CONFIDNECE = 0.8;
public static final boolean DEFAULT_PROCESS_ADJECTIVES_ONLY = false;
/**
* Service Properties used by this Engine
*/
private static final Map<String,Object> SERVICE_PROPERTIES;
static {
Map<String,Object> props = new HashMap<String,Object>();
props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
ServiceProperties.ORDERING_NLP_POS - 1); //after POS tagging
props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
NlpProcessingRole.SentimentTagging);
SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
}
private static Logger log = LoggerFactory.getLogger(SentimentEngine.class);
/**
* {@link SentimentClassifier} are now OSGI services and injected via events
* (calls to {@link #bindClassifier(SentimentClassifier)} and
* {@link #unbindClassifier(SentimentClassifier)}) as soon as they become
* (un)available.
*/
@Reference(referenceInterface=SentimentClassifier.class,
cardinality=ReferenceCardinality.OPTIONAL_MULTIPLE,
bind="bindClassifier",
unbind="unbindClassifier",
policy=ReferencePolicy.DYNAMIC,
strategy=ReferenceStrategy.EVENT)
private Map<String,SentimentClassifier> classifiers = Collections.synchronizedMap(
new HashMap<String,SentimentClassifier>());
/** bind method for {@link #classifiers} */
protected void bindClassifier(SentimentClassifier classifier){
log.info(" ... bind Sentiment Classifier {} for language {}",
classifier.getClass().getSimpleName(),classifier.getLanguage());
synchronized (classifiers) {
SentimentClassifier old = classifiers.put(classifier.getLanguage(), classifier);
if(old != null){
log.warn("Replaced Sentiment Classifier for language {} (old: {}, new: {}",
new Object[]{old.getLanguage(),old,classifier});
}
}
}
/** unbind method for {@link #classifiers} */
protected void unbindClassifier(SentimentClassifier classifier){
String lang = classifier.getLanguage();
synchronized (classifiers) {
SentimentClassifier current = classifiers.remove(lang);
if(!classifier.equals(current) //the current is not the parsed one
&& current != null){
classifiers.put(lang,current); //re-add the value
} else {
log.info(" ... unbind Sentiment Classifier {} for language {}",
classifier.getClass().getSimpleName(),lang);
}
}
}
/**
* The processed {@link LexicalCategory LexicalCategories}.
*/
boolean adjectivesOnly = DEFAULT_PROCESS_ADJECTIVES_ONLY;
/**
* The minimum {@link PosTag} value {@link Value#probability() confidence}.<p>
* This means that if the {@link Value#probability() confidence} of a
* {@link NlpAnnotations#POS_ANNOTATION}s (returned by
* {@link Token#getAnnotations(Annotation)}) is greater than
* {@link #minPOSConfidence} that the result of
* {@link SentimentClassifier#isAdjective(PosTag)} (and
* {@link SentimentClassifier#isNoun(PosTag)} - if #CONFIG_ADJECTIVES is
* deactivated) is used to decide if a Token needs to be processed or not.
* Otherwise further {@link NlpAnnotations#POS_ANNOTATION}s are analysed for
* processable POS tags. Processable POS tags are accepted until
* <code>{@link #minPOSConfidence}/2</code>.
*/
private double minPOSConfidence = DEFAULT_MIN_POS_CONFIDNECE;
/**
* Indicate if this engine can enhance supplied ContentItem, and if it
* suggests enhancing it synchronously or asynchronously. The
* {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
* just a suggestion from the engine.
* <p/>
* Returns {@link EnhancementEngine}#ENHANCE_ASYNC if <ul>
* <li> the {@link AnalysedText} content part is present
* <li> the language of the content is known
* <li> the language is active based on the language configuration and
* <li> a sentiment classifier is available for the language
* </ul>
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the introspecting process of the content item
* fails
*/
@Override
public int canEnhance(ContentItem ci) throws EngineException {
if(getAnalysedText(this,ci, false) == null){
return CANNOT_ENHANCE;
}
String language = getLanguage(this, ci,false);
if(language == null) {
return CANNOT_ENHANCE;
}
if(classifiers.containsKey(language)){
return ENHANCE_ASYNC;
} else {
return CANNOT_ENHANCE;
}
}
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
AnalysedText analysedText = getAnalysedText(this,ci, true);
String language = getLanguage(this, ci, true);
SentimentClassifier classifier = classifiers.get(language);
if(classifier == null){
throw new IllegalStateException("Sentiment Classifier for language '"
+ language +"' not available. As this is also checked in "
+ " canEnhance this may indicate an Bug in the used "
+ "EnhancementJobManager!");
}
//TODO: locking for AnalysedText not yet defined
// ci.getLock().writeLock().lock();
// try {
Iterator<Token> tokens = analysedText.getTokens();
while(tokens.hasNext()){
Token token = tokens.next();
Set<LexicalCategory> cats = null;
boolean process = false;
if(!adjectivesOnly){
process = true;
Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
if(posTag != null && posTag.probability() == Value.UNKNOWN_PROBABILITY
|| posTag.probability() >= (minPOSConfidence/2.0)){
cats = classifier.getCategories(posTag.value());
} else { //no POS tags or probability to low
cats = Collections.emptySet();
}
} else { //check PosTags if we need to lookup this word
Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
boolean ignore = false;
while(!ignore && !process && posTags.hasNext()) {
Value<PosTag> value = posTags.next();
PosTag tag = value.value();
cats = classifier.getCategories(tag);
boolean state = cats.contains(LexicalCategory.Adjective)
|| cats.contains(LexicalCategory.Noun);
ignore = !state && (value.probability() == Value.UNKNOWN_PROBABILITY ||
value.probability() >= minPOSConfidence);
process = state && (value.probability() == Value.UNKNOWN_PROBABILITY ||
value.probability() >= (minPOSConfidence/2.0));
}
} //else process all tokens ... no POS tag checking needed
if(process){
String word = token.getSpan();
double sentiment = 0.0;
if(cats.isEmpty()){
sentiment = classifier.classifyWord(null, word);
} else { //in case of multiple Lexical Cats
//we build the average over NOT NULL sentiments for the word
int catSentNum = 0;
for(LexicalCategory cat : cats){
double catSent = classifier.classifyWord(cat, word);
if(catSent != 0.0){
catSentNum++;
sentiment = sentiment + catSent;
}
}
if(catSentNum > 0){
sentiment = sentiment / (double) catSentNum;
}
}
if(sentiment != 0.0){
token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment));
} //else do not set sentiments with 0.0
} // else do not process
}
// } finally {
// ci.getLock().writeLock().unlock();
// }
}
/**
* Activate and read the properties. Configures and initialises a ChunkerHelper for each language configured in
* CONFIG_LANGUAGES.
*
* @param ce the {@link org.osgi.service.component.ComponentContext}
*/
@Activate
protected void activate(ComponentContext ce) throws ConfigurationException {
log.info("activating POS tagging engine");
super.activate(ce);
@SuppressWarnings("unchecked")
Dictionary<String, Object> properties = ce.getProperties();
//parse the configured languages
langaugeConfig.setConfiguration(properties);
//set the processed lexical categories
Object value = properties.get(CONFIG_ADJECTIVES);
adjectivesOnly = value instanceof Boolean ? (Boolean)value :
value != null ? Boolean.parseBoolean(value.toString()) :
DEFAULT_PROCESS_ADJECTIVES_ONLY;
//set minimum POS confidence
value = properties.get(CONFIG_MIN_POS_CONFIDENCE);
if(value instanceof Number){
minPOSConfidence = ((Number)value).doubleValue();
} else if(value != null){
try {
minPOSConfidence = Double.parseDouble(value.toString());
} catch (NumberFormatException e) {
throw new ConfigurationException(CONFIG_MIN_POS_CONFIDENCE,
"Unable to parsed minimum POS confidence value from '"
+ value +"'!",e);
}
} else {
minPOSConfidence = DEFAULT_MIN_POS_CONFIDNECE;
}
if(minPOSConfidence <= 0 || minPOSConfidence >= 1){
throw new ConfigurationException(CONFIG_MIN_POS_CONFIDENCE,
"The configured minimum POS confidence value '"
+minPOSConfidence+"' MUST BE > 0 and < 1!");
}
}
@Deactivate
protected void deactivate(ComponentContext ctx){
//remove remaining classifiers
this.classifiers.clear();
langaugeConfig.setDefault();
super.deactivate(ctx);
}
@Override
public Map<String,Object> getServiceProperties() {
return SERVICE_PROPERTIES;
}
}