blob: 767dff20bb013a23c60005707021ae9693ee56a2 [file] [log] [blame]
/*
* Copyright (c) 2012 Sebastian Schaffert
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.sentiment.classifiers;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.Dictionary;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Reference;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileListener;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
import org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier;
import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
import org.apache.stanbol.enhancer.engines.sentiment.util.WordSentimentDictionary;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.osgi.framework.BundleContext;
import org.osgi.framework.ServiceRegistration;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * OSGi component providing a German word sentiment classifier based on SentiWS.
 * <p>
 * The component asks the {@link DataFileTracker} for the SentiWS positive and
 * negative word lists. Each list that becomes available is parsed into a
 * {@link WordSentimentDictionary}; as soon as all required lists are loaded
 * the {@link SentiWsClassifierDE} is registered as {@link SentimentClassifier}
 * service for the language "de".
 *
 * @author Sebastian Schaffert
 * @author Rupert Westenthaler
 */
@Component(immediate=true)
public class SentiWSComponent {

    private static final Logger log = LoggerFactory.getLogger(SentiWSComponent.class);

    /**
     * Properties passed to the {@link DataFileTracker} when the SentiWS files
     * are registered for tracking.
     */
    private static final Map<String,String> modelProperties = new HashMap<String,String>();
    static {
        modelProperties.put("Description", "Sentiment Word List (German)");
        modelProperties.put("Download Location", "http://wortschatz.informatik.uni-leipzig.de/download/");
    }

    @Reference
    private DataFileTracker dataFileProvider;

    /** Set in {@link #activate(ComponentContext)}, reset in {@link #deactivate(ComponentContext)} */
    protected BundleContext bundleContext;

    /**
     * Registration for the {@link SentiWsClassifierDE}. Will be created as
     * soon as both required data files are available and successfully loaded
     */
    protected ServiceRegistration sentiWsClassifierService;

    /** The classifier instance filled by the {@link ModelListener} */
    protected SentiWsClassifierDE sentiWsClassifier;

    private ModelListener modelListener = new ModelListener();

    /** Names of the data files required by the classifier */
    protected Set<String> sentiWsFileNames = new HashSet<String>();

    /** Names of the required data files that were already loaded successfully */
    protected Set<String> loadedSentiWsFiles = new HashSet<String>();

    public SentiWSComponent() {}

    /**
     * Tracks the SentiWS files and triggers the registration of the service
     */
    private class ModelListener implements DataFileListener {

        /**
         * Parses the notified resource into the classifier and registers the
         * service once all required resources are loaded.
         *
         * @param resourceName the name of the resource that became available
         * @param is the stream to read the resource from
         * @return <code>false</code> if loading failed (keep tracking the
         * resource), otherwise <code>true</code>
         */
        @Override
        public boolean available(String resourceName, InputStream is) {
            if(sentiWsFileNames.contains(resourceName)){
                log.info("sentiWs resource {} available",resourceName);
                //work on a local reference as deactivate(..) resets the field
                SentiWsClassifierDE classifier = sentiWsClassifier;
                try {
                    long start = System.currentTimeMillis();
                    if(classifier != null){
                        classifier.parseSentiWS(is);
                        loadedSentiWsFiles.add(resourceName);
                        log.info(" ... loaded in {} ms",(System.currentTimeMillis()-start));
                    }
                } catch (IOException e) {
                    log.warn("Unable to load sentiWs resource '"+resourceName+"!",e);
                    return false; //keep tracking
                } catch (RuntimeException e) {
                    log.error("RuntimeException while loading sentiWs resource '"
                            +resourceName+"!",e);
                    return false; //keep tracking
                }
            } else {
                log.warn("Tracker notified event for non-tracked resource '{}'"
                        + "(tracked: {})!",resourceName,sentiWsFileNames);
            }
            //all resources available ... start the service
            //NOTE: after deactivation both sets are empty (and therefore
            //      equal). The isEmpty() check prevents a registration then.
            if(!sentiWsFileNames.isEmpty() && loadedSentiWsFiles.equals(sentiWsFileNames)){
                log.info("register Sentiment Classifier for SentiWs (german)");
                registerService();
            } else {
                log.info("loaded {} (required: {})",loadedSentiWsFiles,sentiWsFileNames);
            }
            //remove registration
            return true;
        }

        @Override
        public boolean unavailable(String resourceName) {
            //not used;
            return false;
        }
    }

    /**
     * Activates the component: creates the classifier and starts tracking
     * the required SentiWS data files.
     *
     * @param ctx the component context providing the {@link BundleContext}
     */
    @Activate
    protected void activate(ComponentContext ctx){
        bundleContext = ctx.getBundleContext();
        //The classifier MUST exist before the files are registered with the
        //tracker. Otherwise a notification received while registering would
        //find no classifier, skip the file and still end the tracking for it.
        sentiWsClassifier = new SentiWsClassifierDE();
        //TODO: make Filenames configurable
        sentiWsFileNames.add("SentiWS_v1.8b_Negative.txt");
        sentiWsFileNames.add("SentiWS_v1.8b_Positive.txt");
        //register files with the DataFileTracker
        for(String sentiWsFile : sentiWsFileNames){
            dataFileProvider.add(modelListener, sentiWsFile, modelProperties);
        }
    }

    /**
     * Registers the {@link #sentiWsClassifier} as {@link SentimentClassifier}
     * service with the property "language" set to "de". Does nothing if the
     * component is not active or the service is already registered.
     */
    protected void registerService() {
        BundleContext bc = bundleContext;
        SentiWsClassifierDE classifier = sentiWsClassifier;
        //a null service object would be rejected by the BundleContext
        if(bc == null || classifier == null || sentiWsClassifierService != null){
            return;
        }
        Dictionary<String,Object> serviceProperties = new Hashtable<String,Object>();
        serviceProperties.put("language", "de"); //set the language
        sentiWsClassifierService = bc.registerService(
            SentimentClassifier.class.getName(), classifier,
            serviceProperties);
    }

    /**
     * Deactivates the component: stops tracking the data files, removes the
     * service registration and frees the loaded dictionary.
     *
     * @param ctx the component context (not used)
     */
    protected void deactivate(ComponentContext ctx) {
        //end datafile tracking
        dataFileProvider.removeAll(modelListener);
        sentiWsFileNames.clear();
        loadedSentiWsFiles.clear();
        //remove service registration
        if(sentiWsClassifierService != null){
            sentiWsClassifierService.unregister();
            //reset so that registerService() works after a re-activation
            sentiWsClassifierService = null;
        }
        //free up resources
        if(sentiWsClassifier != null){
            sentiWsClassifier.close();
            sentiWsClassifier = null;
        }
        bundleContext = null;
    }

    /**
     * The OSGI service registered as soon as the required DataFiles are
     * available
     */
    public static class SentiWsClassifierDE extends LexicalCategoryClassifier implements SentimentClassifier {

        /**
         * Encoding used to read the word lists.
         * NOTE(review): assumes the SentiWS files are distributed UTF-8
         * encoded - confirm against the SentiWS distribution notes.
         */
        private static final String SENTIWS_ENCODING = "UTF-8";

        private WordSentimentDictionary dict = new WordSentimentDictionary(Locale.GERMAN);

        protected SentiWsClassifierDE(){}

        /**
         * Parses a SentiWS word list and adds all contained words to the
         * dictionary. Blank lines and lines that can not be parsed are
         * skipped with a warning. The parsed stream is closed.
         *
         * @param is the stream to read the word list from
         * @throws IOException on any error while reading from the stream
         */
        protected void parseSentiWS(InputStream is) throws IOException {
            log.debug("parsing SentiWS word lists ...");
            //explicit encoding: the platform default may corrupt umlauts
            BufferedReader in = new BufferedReader(new InputStreamReader(is, SENTIWS_ENCODING));
            try {
                int lineNumber = 0;
                for(String line = in.readLine(); line != null; line = in.readLine()) {
                    lineNumber++;
                    //a leading byte order mark would become part of the first word
                    if(lineNumber == 1 && line.length() > 0 && line.charAt(0) == '\uFEFF'){
                        line = line.substring(1);
                    }
                    line = line.trim();
                    if(line.length() == 0){
                        continue; //ignore blank lines
                    }
                    // input file will have a space- or tab-separated list per line:
                    // - first component is the main word with a specification what kind of word it is (can be result from POS tagging)
                    // - second component is the positive or negative sentiment associated with the word
                    // - third argument is a comma-separated list of deflections of the word as they might also occur in text
                    String[] components = line.split("\\s+");
                    if(components.length < 2){
                        log.warn("ignore SentiWS line {} (missing weight): '{}'", lineNumber, line);
                        continue;
                    }
                    // get the main word and the pos tag
                    String[] wordPart = components[0].split("\\|");
                    if(wordPart.length < 2 || wordPart[0].length() == 0 || wordPart[1].length() == 0){
                        log.warn("ignore SentiWS line {} (expected '<word>|<posTag>'): '{}'", lineNumber, line);
                        continue;
                    }
                    // parse the weight
                    Double weight;
                    try {
                        weight = Double.valueOf(components[1]);
                    } catch (NumberFormatException e) {
                        log.warn("ignore SentiWS line {} (invalid weight): '{}'", lineNumber, line);
                        continue;
                    }
                    String mainWord = wordPart[0];
                    LexicalCategory cat = getLexicalCategory(wordPart[1]);
                    if(cat == null){
                        log.warn("ignore SentiWS line {} (unsupported posTag): '{}'", lineNumber, line);
                        continue;
                    }
                    dict.updateSentiment(cat, mainWord, weight);
                    // get the remaining words (deflections)
                    if(components.length > 2) {
                        for(String word : components[2].split(",")) {
                            if(word.length() > 0){
                                dict.updateSentiment(cat, word, weight);
                            }
                        }
                    }
                }
            } finally {
                IOUtils.closeQuietly(in);
            }
        }

        /**
         * Maps the pos tag of a SentiWS entry to a {@link LexicalCategory}
         * based on the first character of the tag.
         * <p>
         * NOTE(review): every tag starting with 'A' is mapped to
         * {@link LexicalCategory#Adjective}. This would include a tag such as
         * "ADV" - confirm that this is intended.
         *
         * @param posTag the pos tag
         * @return the category or <code>null</code> if the tag is empty or
         * not supported
         */
        private LexicalCategory getLexicalCategory(String posTag){
            if(posTag == null || posTag.length() == 0){
                return null;
            }
            switch (posTag.charAt(0)) {
                case 'N':
                    return LexicalCategory.Noun;
                case 'V':
                    return LexicalCategory.Verb;
                case 'A':
                    return LexicalCategory.Adjective;
                default:
                    return null;
            }
        }

        @Override
        public String getLanguage() {
            return "de";
        }

        /**
         * Given the word passed as argument, return a value between -1 and 1 indicating its sentiment value from
         * very negative to very positive. Unknown words should return the value 0.
         *
         * @param cat the lexical category of the word
         * @param word the word to classify
         * @return the sentiment stored in the dictionary for the word or
         * <code>0.0</code> if the dictionary has no entry
         */
        @Override
        public double classifyWord(LexicalCategory cat, String word) {
            Double sentiment = dict.getSentiment(cat, word);
            return sentiment != null ? sentiment.doubleValue() : 0.0;
        }

        /**
         * Internally used to free up resources when the service is
         * unregistered
         */
        protected void close(){
            dict.clear();
        }
    }
}