blob: e2d065b4dd496581991fda6ccc41ae0f541592c1 [file] [log] [blame]
/*
* Copyright (c) 2012 Sebastian Schaffert
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.sentiment.classifiers;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Reference;
import org.apache.lucene.analysis.en.EnglishMinimalStemmer;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileListener;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
import org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier;
import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
import org.apache.stanbol.enhancer.engines.sentiment.util.WordSentimentDictionary;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.osgi.framework.BundleContext;
import org.osgi.framework.ServiceRegistration;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A word classifier for the english language based on SentiWordNet. Reads in a SentiWordNet file and
* represents mappings from word to sentiment score between -1 and 1 in a hashmap.
* <p/>
* Future versions might make use of a disk-based storage of the hashmap to improve memory performance.
* <p/>
* Note that a license for SentiWordNet is required if you intend to use the classifier in commercial
* settings.
* <p/>
* @author Sebastian Schaffert
* @author Rupert Westenthaler
*/
@Component(immediate = true)
public class SentiWordNet {

    /**
     * Properties handed to the {@link DataFileTracker} together with the
     * tracked resource name (see {@link #activate(ComponentContext)}).
     */
    private static final Map<String,String> modelProperties = new HashMap<String,String>();
    static {
        // this component loads the English SentiWordNet lexicon (it registers
        // the classifier for language "en"), so the description must not
        // claim a German word list
        modelProperties.put("Description", "SentiWordNet Sentiment Word List (English)");
        // NOTE(review): this points to the WordNet site, not to a SentiWordNet
        // distribution - confirm the intended download location
        modelProperties.put("Download Location", "http://wordnet.princeton.edu/");
    }

    private static final Logger log = LoggerFactory.getLogger(SentiWordNet.class);

    /** Default name of the SentiWordNet data file requested from the tracker. */
    private static final String SENTIWORDNET_RESOURCE = "SentiWordNet_3.0.0_20120206.txt";

    /** Name of the tracked resource; <code>null</code> while the component is not active. */
    protected String sentiWordNetFile;

    private ModelListener modelListener = new ModelListener();

    @Reference
    private DataFileTracker dataFileTracker;

    private BundleContext bundleContext;

    /** The classifier instance; created on activation, closed on deactivation. */
    protected SentiWordNetClassifierEN classifier;

    /** Registration of {@link #classifier} as OSGI service; <code>null</code> if not registered. */
    protected ServiceRegistration classifierRegistration;

    public SentiWordNet() {}

    /**
     * Creates the (still empty) classifier and starts tracking the
     * SentiWordNet data file. The classifier is registered as service by the
     * {@link ModelListener} as soon as the file was loaded.
     *
     * @param ctx the component context providing the {@link BundleContext}
     */
    @Activate
    protected void activate(ComponentContext ctx){
        bundleContext = ctx.getBundleContext();
        //TODO: make configurable
        sentiWordNetFile = SENTIWORDNET_RESOURCE;
        classifier = new SentiWordNetClassifierEN();
        dataFileTracker.add(modelListener, sentiWordNetFile, modelProperties);
    }

    /**
     * Stops tracking the data file, unregisters the classifier service (if
     * registered) and releases the loaded dictionary.
     *
     * @param ctx the component context (not used)
     */
    @Deactivate
    protected void deactivate(ComponentContext ctx){
        // stop tracking first so that the listener is no longer notified
        // while the state it works on is torn down
        dataFileTracker.removeAll(modelListener);
        if(classifierRegistration != null){
            classifierRegistration.unregister();
            classifierRegistration = null;
        }
        if(classifier != null){
            classifier.close();
            classifier = null;
        }
        sentiWordNetFile = null;
        // registerService() checks for null; this prevents a registration
        // after the component was deactivated
        bundleContext = null;
    }

    /**
     * Tracks the SentiWordNet file and triggers the registration of the service
     */
    private class ModelListener implements DataFileListener {

        /**
         * Loads the tracked resource into the classifier and registers the
         * classifier as service.
         *
         * @param resourceName the name of the resource that became available
         * @param is the stream to read the resource from
         * @return <code>false</code> if loading failed (keep tracking),
         * otherwise <code>true</code>
         */
        @Override
        public boolean available(String resourceName, InputStream is) {
            // compare from the argument side: sentiWordNetFile is reset to
            // null by deactivate() and must not be dereferenced here
            if(resourceName != null && resourceName.equals(sentiWordNetFile)){
                log.info("{} resource available",resourceName);
                // work on a snapshot as deactivate() sets the field to null
                SentiWordNetClassifierEN current = classifier;
                try {
                    long start = System.currentTimeMillis();
                    if(current != null){
                        current.parseSentiWordNet(is);
                        log.info(" ... loaded in {} ms",(System.currentTimeMillis()-start));
                        registerService(); //register the service
                    }
                } catch (IOException e) {
                    log.warn("Unable to load '"+resourceName+"'!",e);
                    return false; //keep tracking
                } catch (RuntimeException e) {
                    log.error("RuntimeException while loading '"
                        +resourceName+"'!",e);
                    return false; //keep tracking
                }
            } else {
                log.warn("Tracker notified event for non-tracked resource '{}'"
                    + "(tracked: {})!",resourceName,sentiWordNetFile);
            }
            //remove registration
            return true;
        }

        @Override
        public boolean unavailable(String resourceName) {
            //not used;
            return false;
        }
    }

    /**
     * Registers the {@link #classifier} as {@link SentimentClassifier}
     * service for the language "en". Does nothing if no bundle context or
     * classifier is present, or if the classifier is already registered.
     */
    protected void registerService() {
        BundleContext bc = bundleContext;
        SentiWordNetClassifierEN service = classifier;
        if(bc == null || service == null){
            return; //component not active
        }
        if(classifierRegistration != null){
            // a second registration would overwrite (and leak) the first one
            return;
        }
        Dictionary<String,Object> serviceProperties = new Hashtable<String,Object>();
        serviceProperties.put("language", "en"); //set the language
        classifierRegistration = bc.registerService(
            SentimentClassifier.class.getName(), service,
            serviceProperties);
    }

    /**
     * The OSGI service registered as soon as the required DataFiles are
     * available
     */
    public static class SentiWordNetClassifierEN extends LexicalCategoryClassifier implements SentimentClassifier {

        WordSentimentDictionary dict = new WordSentimentDictionary(Locale.ENGLISH);

        private final EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();

        protected SentiWordNetClassifierEN() {}

        /**
         * Parses a SentiWordNet file and adds the sentiment (positive score
         * minus negative score) of every synonym to the dictionary. Entries
         * with a sentiment of zero are skipped; lines that can not be parsed
         * are logged and ignored. The stream is closed when this method
         * returns.
         *
         * @param is the stream with the SentiWordNet data
         * @throws IOException on any error while reading from the stream
         */
        protected void parseSentiWordNet(InputStream is) throws IOException {
            // decode with an explicit charset instead of the platform default
            // so that loading does not depend on the JVM configuration
            BufferedReader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            try {
                // read line by line:
                // - lines starting with # are ignored
                // - valid lines have the format POS ID POSSCORE NEGSCORE SYNONYMS GLOSS separated by tabs
                String line;
                while ((line = in.readLine()) != null) {
                    line = line.trim();
                    if (line.length() == 0 || line.charAt(0) == '#') {
                        continue; //empty line or comment
                    }
                    String[] components = line.split("\t");
                    if (components.length < 5) {
                        log.warn("could not parse SentiWordNet line '{}': expected at least "
                            + "5 tab separated fields", line);
                        continue;
                    }
                    try {
                        LexicalCategory cat = parseLexCat(components[0]);
                        double posScore = Double.parseDouble(components[2]);
                        double negScore = Double.parseDouble(components[3]);
                        double score = posScore - negScore;
                        if (score == 0.0) {
                            continue; //neutral entries are not stored
                        }
                        for (String synonymToken : components[4].split(" ")) {
                            // synonymTokens are of the form word#position, so we strip out the position
                            // part
                            int sep = synonymToken.indexOf('#');
                            String word = sep < 0 ? synonymToken : synonymToken.substring(0, sep);
                            if (word.length() == 0) {
                                continue; //caused by multiple spaces or a token without word
                            }
                            dict.updateSentiment(cat, getStemmed(word), score);
                        }
                    } catch (RuntimeException ex) {
                        log.warn("could not parse SentiWordNet line '{}': {}", line, ex.getMessage());
                    }
                }
            } finally {
                IOUtils.closeQuietly(in);
            }
        }

        /**
         * Maps the POS field of a SentiWordNet line to the lexical category
         * based on its first character ('a', 'v', 'n' or 'r').
         *
         * @param val the value of the POS field
         * @return the lexical category
         * @throws IllegalStateException if the value is empty or starts with
         * an unsupported character
         */
        private LexicalCategory parseLexCat(String val) {
            if (val.length() == 0) {
                throw new IllegalStateException("Missing POS tag!");
            }
            switch (val.charAt(0)) {
                case 'a':
                    return LexicalCategory.Adjective;
                case 'v':
                    return LexicalCategory.Verb;
                case 'n':
                    return LexicalCategory.Noun;
                case 'r':
                    return LexicalCategory.Adverb;
                default:
                    throw new IllegalStateException("Unknown POS tag '"+val+"'!");
            }
        }

        /**
         * Given the word passed as argument, return a value between -1 and 1 indicating its sentiment value
         * from very negative to very positive. Unknown words should return the value 0.
         *
         * @param cat the lexical category of the word
         * @param word the word to classify
         * @return the sentiment of the word or <code>0</code> if the word is
         * <code>null</code> or not present in the dictionary
         */
        @Override
        public double classifyWord(LexicalCategory cat, String word) {
            if (word == null) {
                return 0.0; //nothing to look up
            }
            Double sentiment = dict.getSentiment(cat, getStemmed(word));
            return sentiment != null ? sentiment.doubleValue() : 0.0;
        }

        /**
         * Stems the parsed word by using the {@link EnglishMinimalStemmer}.
         *
         * @param word the word to stem
         * @return the stemmed word
         */
        private String getStemmed(String word) {
            // the stemmer works in place and may rewrite characters of the
            // buffer, so the result needs to be built from the buffer and
            // not from the original word
            char[] buffer = word.toCharArray();
            int stemmedLength = stemmer.stem(buffer, buffer.length);
            return new String(buffer, 0, stemmedLength);
        }

        @Override
        public String getLanguage() {
            return "en";
        }

        /**
         * Releases the loaded dictionary entries.
         */
        protected void close(){
            dict.clear();
        }
    }
}