| /* |
| * Copyright (c) 2012 Sebastian Schaffert |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.stanbol.enhancer.engines.sentiment.classifiers; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.util.Dictionary; |
| import java.util.HashMap; |
| import java.util.Hashtable; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.TreeMap; |
| import java.util.concurrent.locks.ReadWriteLock; |
| import java.util.concurrent.locks.ReentrantReadWriteLock; |
| |
| import org.apache.commons.io.IOUtils; |
| import org.apache.felix.scr.annotations.Activate; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.Deactivate; |
| import org.apache.felix.scr.annotations.Reference; |
| import org.apache.lucene.analysis.en.EnglishMinimalStemmer; |
| import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileListener; |
| import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker; |
| import org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier; |
| import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier; |
| import org.apache.stanbol.enhancer.engines.sentiment.util.WordSentimentDictionary; |
| import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory; |
| import org.osgi.framework.BundleContext; |
| import org.osgi.framework.ServiceRegistration; |
| import org.osgi.service.component.ComponentContext; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * A word classifier for the english language based on SentiWordNet. Reads in a SentiWordNet file and |
| * represents mappings from word to sentiment score between -1 and 1 in a hashmap. |
| * <p/> |
| * Future versions might make use of a disk-based storage of the hashmap to improve memory performance. |
| * <p/> |
| * Note that a license for SentiWordNet is required if you intend to use the classifier in commercial |
| * settings. |
| * <p/> |
| * @author Sebastian Schaffert |
| * @autor Rupert Westenthaler |
| */ |
| @Component(immediate = true) |
| public class SentiWordNet { |
| |
| private static final Map<String,String> modelProperties = new HashMap<String,String>(); |
| static { |
| modelProperties.put("Description", "Sentiment Word List (German)"); |
| modelProperties.put("Download Location", "http://wordnet.princeton.edu/"); |
| } |
| private static Logger log = LoggerFactory.getLogger(SentiWordNet.class); |
| |
| private static final String SENTIWORDNET_RESOURCE = "SentiWordNet_3.0.0_20120206.txt"; |
| |
| protected String sentiWordNetFile; |
| |
| private ModelListener modelListener = new ModelListener(); |
| |
| @Reference |
| private DataFileTracker dataFileTracker; |
| |
| private BundleContext bundleContext; |
| |
| protected SentiWordNetClassifierEN classifier; |
| |
| protected ServiceRegistration classifierRegistration; |
| |
| public SentiWordNet() {} |
| |
| @Activate |
| protected void activate(ComponentContext ctx){ |
| bundleContext = ctx.getBundleContext(); |
| //TODO: make configurable |
| sentiWordNetFile = SENTIWORDNET_RESOURCE; |
| |
| classifier = new SentiWordNetClassifierEN(); |
| |
| dataFileTracker.add(modelListener, sentiWordNetFile, modelProperties); |
| } |
| |
| @Deactivate |
| protected void deactivate(ComponentContext ctx){ |
| if(classifierRegistration != null){ |
| classifierRegistration.unregister(); |
| classifierRegistration = null; |
| } |
| if(classifier != null){ |
| classifier.close(); |
| classifier = null; |
| } |
| dataFileTracker.removeAll(modelListener); |
| sentiWordNetFile = null; |
| } |
| |
| /** |
| * Tracks the SentiWS files and triggers the registration of the service |
| */ |
| private class ModelListener implements DataFileListener { |
| |
| @Override |
| public boolean available(String resourceName, InputStream is) { |
| if(sentiWordNetFile.equals(resourceName)){ |
| log.info("{} resource available",resourceName); |
| try { |
| long start = System.currentTimeMillis(); |
| if(classifier != null){ |
| classifier.parseSentiWordNet(is); |
| log.info(" ... loaded in {} ms",(System.currentTimeMillis()-start)); |
| registerService(); //register the service |
| } |
| } catch (IOException e) { |
| log.warn("Unable to load '"+resourceName+"'!",e); |
| return false; //keep tracking |
| } catch (RuntimeException e) { |
| log.error("RuntimeException while loading '" |
| +resourceName+"!",e); |
| return false; //keep tracking |
| } |
| } else { |
| log.warn("Tracker notified event for non-tracked resource '{}'" |
| + "(tracked: {})!",resourceName,sentiWordNetFile); |
| } |
| //remove registration |
| return true; |
| } |
| |
| @Override |
| public boolean unavailable(String resourceName) { |
| //not used; |
| return false; |
| } |
| |
| } |
| |
| protected void registerService() { |
| Dictionary<String,Object> serviceProperties = new Hashtable<String,Object>(); |
| serviceProperties.put("language", "en"); //set the language |
| BundleContext bc = bundleContext; |
| if(bc != null){ |
| classifierRegistration = bc.registerService( |
| SentimentClassifier.class.getName(), classifier, |
| serviceProperties); |
| } |
| } |
| /** |
| * The OSGI service registered as soon as the required DataFiles are |
| * available |
| */ |
| public static class SentiWordNetClassifierEN extends LexicalCategoryClassifier implements SentimentClassifier { |
| |
| WordSentimentDictionary dict = new WordSentimentDictionary(Locale.ENGLISH); |
| |
| private org.apache.lucene.analysis.en.EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer(); |
| |
| protected SentiWordNetClassifierEN() {} |
| |
| protected void parseSentiWordNet(InputStream is) throws IOException { |
| BufferedReader in = new BufferedReader(new InputStreamReader(is)); |
| try { |
| // read line by line: |
| // - lines starting with # are ignored |
| // - valid lines have the format POS ID POSSCORE NEGSCORE SYNONYMS GLOSS separated by tags |
| for (String line = in.readLine(); line != null; line = in.readLine()) { |
| line = line.trim(); |
| if (line.length() > 0 && line.charAt(0) != '#') { |
| String[] components = line.split("\t"); |
| |
| try { |
| LexicalCategory cat = parseLexCat(components[0]); |
| double posScore = Double.parseDouble(components[2]); |
| double negScore = Double.parseDouble(components[3]); |
| String synonyms = components[4]; |
| |
| Double score = posScore - negScore; |
| |
| if (score != 0.0) { |
| for (String synonymToken : synonyms.split(" ")) { |
| // synonymTokens are of the form word#position, so we strip out the position |
| // part |
| String[] synonym = synonymToken.split("#"); |
| String stemmed = getStemmed(synonym[0]); |
| dict.updateSentiment(cat, stemmed, score); |
| } |
| } |
| |
| } catch (RuntimeException ex) { |
| log.warn("could not parse SentiWordNet line '{}': {}", line, ex.getMessage()); |
| } |
| } |
| } |
| } finally { |
| IOUtils.closeQuietly(in); |
| } |
| } |
| |
| private LexicalCategory parseLexCat(String val) { |
| switch (val.charAt(0)) { |
| case 'a': |
| return LexicalCategory.Adjective; |
| case 'v': |
| return LexicalCategory.Verb; |
| case 'n': |
| return LexicalCategory.Noun; |
| case 'r': |
| return LexicalCategory.Adverb; |
| default: |
| throw new IllegalStateException("Uncown POS tag '"+val+"'!"); |
| } |
| } |
| |
| |
| /** |
| * Given the word passed as argument, return a value between -1 and 1 indicating its sentiment value |
| * from very negative to very positive. Unknown words should return the value 0. |
| * |
| * @param word |
| * @return |
| */ |
| @Override |
| public double classifyWord(LexicalCategory cat, String word) { |
| Double sentiment = dict.getSentiment(cat, getStemmed(word)); |
| return sentiment != null ? sentiment.doubleValue() : 0.0; |
| } |
| |
| private String getStemmed(String word) { |
| return word.substring(0, stemmer.stem(word.toCharArray(), word.length())); |
| } |
| |
| @Override |
| public String getLanguage() { |
| return "en"; |
| } |
| |
| protected void close(){ |
| dict.clear(); |
| } |
| } |
| } |