| /* |
| * Copyright (c) 2012 Sebastian Schaffert |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.stanbol.enhancer.engines.sentiment.classifiers; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.util.Collections; |
| import java.util.Dictionary; |
| import java.util.EnumMap; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Hashtable; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.concurrent.locks.ReadWriteLock; |
| import java.util.concurrent.locks.ReentrantReadWriteLock; |
| |
| import org.apache.commons.io.IOUtils; |
| import org.apache.felix.scr.annotations.Activate; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.Reference; |
| import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileListener; |
| import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker; |
| import org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier; |
| import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier; |
| import org.apache.stanbol.enhancer.engines.sentiment.util.WordSentimentDictionary; |
| import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory; |
| import org.osgi.framework.BundleContext; |
| import org.osgi.framework.ServiceRegistration; |
| import org.osgi.service.component.ComponentContext; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| /** |
| * A German word classifier based on SentiWS. Reads the SentiWS positive and negative word lists and parses them |
| * into an appropriate hash table, so lookups should be extremely fast. |
| * <p/> |
| * @author Sebastian Schaffert |
| * @author Rupert Westenthaler |
| */ |
| @Component(immediate=true) |
| public class SentiWSComponent { |
| |
/** Logger shared by this component and its nested classes. */
private static final Logger log = LoggerFactory.getLogger(SentiWSComponent.class);

/**
 * Properties passed to the {@link DataFileTracker} together with every
 * tracked SentiWS file name (see {@code activate}).
 */
private static final Map<String,String> modelProperties = new HashMap<String,String>();
static {
    modelProperties.put("Description", "Sentiment Word List (German)");
    modelProperties.put("Download Location", "http://wortschatz.informatik.uni-leipzig.de/download/");
}

/** Tracker used to get notified as soon as the SentiWS files are available. */
@Reference
private DataFileTracker dataFileProvider;

/** Set on activation and reset to <code>null</code> on deactivation. */
protected BundleContext bundleContext;

/**
 * Registration for the {@link SentiWsClassifierDE}. Will be created as
 * soon as both required data files are available and successfully loaded
 */
protected ServiceRegistration sentiWsClassifierService;

/**
 * The classifier instance the SentiWS files are parsed into. Created on
 * activation, closed and reset to <code>null</code> on deactivation.
 */
protected SentiWsClassifierDE sentiWsClassifier;

/** Listener registered with the {@link DataFileTracker} for every SentiWS file. */
private final ModelListener modelListener = new ModelListener();

/** Names of the SentiWS files required by this component. */
protected Set<String> sentiWsFileNames = new HashSet<String>();

/** Names of the SentiWS files that were already parsed successfully. */
protected Set<String> loadedSentiWsFiles = new HashSet<String>();

/** Default constructor used when the component is instantiated. */
public SentiWSComponent() {}
| |
| /** |
| * Tracks the SentiWS files and triggers the registration of the service |
| */ |
| private class ModelListener implements DataFileListener { |
| |
| @Override |
| public boolean available(String resourceName, InputStream is) { |
| if(sentiWsFileNames.contains(resourceName)){ |
| log.info("sentiWs resource {} available",resourceName); |
| try { |
| long start = System.currentTimeMillis(); |
| if(sentiWsClassifier != null){ |
| sentiWsClassifier.parseSentiWS(is); |
| loadedSentiWsFiles.add(resourceName); |
| log.info(" ... loaded in {} ms",(System.currentTimeMillis()-start)); |
| } |
| } catch (IOException e) { |
| log.warn("Unable to load sentiWs resource '"+resourceName+"!",e); |
| return false; //keep tracking |
| } catch (RuntimeException e) { |
| log.error("RuntimeException while loading sentiWs resource '" |
| +resourceName+"!",e); |
| return false; //keep tracking |
| } |
| } else { |
| log.warn("Tracker notified event for non-tracked resource '{}'" |
| + "(tracked: {})!",resourceName,sentiWsFileNames); |
| } |
| //all resources available ... start the service |
| if(loadedSentiWsFiles.equals(sentiWsFileNames)){ |
| log.info("register Sentiment Classifier for SentiWs (german)"); |
| registerService(); |
| } else { |
| log.info("loaded {} (required: {})",loadedSentiWsFiles,sentiWsFileNames); |
| } |
| //remove registration |
| return true; |
| } |
| |
| @Override |
| public boolean unavailable(String resourceName) { |
| //not used; |
| return false; |
| } |
| |
| } |
| |
| @Activate |
| protected void activate(ComponentContext ctx){ |
| bundleContext = ctx.getBundleContext(); |
| //TODO: make Filenames configurable |
| sentiWsFileNames.add("SentiWS_v1.8b_Negative.txt"); |
| sentiWsFileNames.add("SentiWS_v1.8b_Positive.txt"); |
| |
| //register files with the DataFileTracker |
| for(String sentiWsFile : sentiWsFileNames){ |
| dataFileProvider.add(modelListener, sentiWsFile, modelProperties); |
| } |
| sentiWsClassifier = new SentiWsClassifierDE(); |
| } |
| |
| protected void registerService() { |
| Dictionary<String,Object> serviceProperties = new Hashtable<String,Object>(); |
| serviceProperties.put("language", "de"); //set the language |
| BundleContext bc = bundleContext; |
| if(bc != null && sentiWsClassifierService == null){ |
| sentiWsClassifierService = bc.registerService( |
| SentimentClassifier.class.getName(), sentiWsClassifier, |
| serviceProperties); |
| } |
| |
| } |
| |
| protected void deactivate(ComponentContext ctx) { |
| //end datafile tracking |
| dataFileProvider.removeAll(modelListener); |
| sentiWsFileNames.clear(); |
| loadedSentiWsFiles.clear(); |
| //remove service registration |
| if(sentiWsClassifierService != null){ |
| sentiWsClassifierService.unregister(); |
| } |
| //free up resources |
| if(sentiWsClassifier != null){ |
| sentiWsClassifier.close(); |
| sentiWsClassifier = null; |
| } |
| bundleContext = null; |
| } |
| |
| /** |
| * The OSGI service registered as soon as the required DataFiles are |
| * available |
| */ |
| public static class SentiWsClassifierDE extends LexicalCategoryClassifier implements SentimentClassifier { |
| |
| private WordSentimentDictionary dict = new WordSentimentDictionary(Locale.GERMAN); |
| |
| protected SentiWsClassifierDE(){} |
| |
| protected void parseSentiWS(InputStream is) throws IOException { |
| log.debug("parsing SentiWS word lists ..."); |
| BufferedReader in = new BufferedReader(new InputStreamReader(is)); |
| try { |
| for(String line = in.readLine(); line != null; line = in.readLine()) { |
| // input file will have a space- or tab-separated list per line: |
| // - first component is the main word with a specification what kind of word it is (can be result from POS tagging) |
| // - second component is the positive or negative sentiment associated with the word |
| // - third argument is a comma-separated list of deflections of the word as they might also occur in text |
| String[] components = line.split("\\s"); |
| |
| // parse the weight |
| Double weight = Double.valueOf(components[1]); |
| |
| // get the main word |
| String[] wordPart = components[0].split("\\|"); |
| String mainWord = wordPart[0]; |
| LexicalCategory cat = getLexicalCategory(wordPart[1]); |
| dict.updateSentiment(cat, mainWord, weight); |
| |
| // get the remaining words (deflections) |
| if(components.length > 2) { |
| for(String word : components[2].split(",")) { |
| dict.updateSentiment(cat, word, weight); |
| } |
| } |
| } |
| } finally { |
| IOUtils.closeQuietly(in); |
| } |
| } |
| |
| private LexicalCategory getLexicalCategory(String posTag){ |
| char c = posTag.charAt(0); |
| switch (c) { |
| case 'N': |
| return LexicalCategory.Noun; |
| case 'V': |
| return LexicalCategory.Verb; |
| case 'A': |
| return LexicalCategory.Adjective; |
| default: //TODO: change this to a warning and return NULL |
| throw new IllegalStateException("Unsupported posTag '"+posTag+"'!"); |
| } |
| } |
| |
| @Override |
| public String getLanguage() { |
| return "de"; |
| } |
| |
| /** |
| * Given the word passed as argument, return a value between -1 and 1 indicating its sentiment value from |
| * very negative to very positive. Unknown words should return the value 0. |
| * |
| * @param word |
| * @return |
| */ |
| @Override |
| public double classifyWord(LexicalCategory cat, String word) { |
| Double sentiment = dict.getSentiment(cat, word); |
| return sentiment != null ? sentiment.doubleValue() : 0.0; |
| } |
| /** |
| * Internally used to free up resources when the service is |
| * unregistered |
| */ |
| protected void close(){ |
| dict.clear(); |
| } |
| } |
| |
| } |