// blob: 44a3640d1c7cfb6f13382dd84d2dabef7bf81b89 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.opinion_processor;
import java.io.IOException;
import java.util.List;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.Iterator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.logging.Logger;
import org.ejml.simple.SimpleMatrix;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.LabeledWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordLemmaTag;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.SentimentAnnotatedTree;
import edu.stanford.nlp.sentiment.SentimentUtils;
import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;
public class DefaultSentimentProcessor {
/** A logger for this class, registered under the class's fully qualified name. */
private static final Logger log = Logger
.getLogger("opennlp.tools.parse_thicket.opinion_processor.DefaultSentimentProcessor");
// Fixed four-decimal pattern applied to every matrix element printed by
// outputTreeVectors and outputTreeScores.
// NOTE(review): this is one shared DecimalFormat instance; confirm the output
// methods are never invoked from several threads at the same time.
private static final NumberFormat NF = new DecimalFormat("0.0000");
/**
 * Output styles understood by {@code outputTree}. The command line selects
 * them by name (case-insensitively) through the {@code -output} flag.
 */
enum Output {
  /** Print a copy of the tree whose non-leaf labels are the predicted sentiment classes. */
  PENNTREES,
  /** Print an index-labelled copy of the tree, then the vector of every non-leaf node. */
  VECTORS,
  /** Print only the sentiment class stored on the sentence itself. */
  ROOT,
  /** Print an index-labelled copy of the tree, then the prediction scores of every non-leaf node. */
  PROBABILITIES
}
/**
 * Input styles understood by {@code getAnnotations}. The command line selects
 * them by name (case-insensitively) through the {@code -input} flag.
 */
enum Input {
  /** The file holds raw text that is tokenized and split into sentences. */
  TEXT,
  /** The file holds trees; each tree becomes one sentence. */
  TREES
}
/**
 * Overwrites the value of every non-leaf label in {@code tree} with the
 * predicted sentiment class, written as a decimal integer, so the
 * predictions show up when the tree is printed. Leaves are left untouched,
 * and the children of a node are relabelled before the node itself.
 *
 * @param tree the tree to relabel in place
 * @throws IllegalArgumentException if a non-leaf node carries a label that
 *         is not a {@code CoreLabel}
 */
static void setSentimentLabels(Tree tree) {
  if (!tree.isLeaf()) {
    for (Tree subtree : tree.children()) {
      setSentimentLabels(subtree);
    }
    final Label nodeLabel = tree.label();
    if (nodeLabel instanceof CoreLabel) {
      final int predictedClass = RNNCoreAnnotations.getPredictedClass(tree);
      ((CoreLabel) nodeLabel).setValue(String.valueOf(predictedClass));
    } else {
      throw new IllegalArgumentException("Required a tree with CoreLabels");
    }
  }
}
/**
 * Replaces the label value of every non-leaf node with its running index.
 * Numbering starts at {@code index} for the node passed in and proceeds in
 * pre-order: a node receives its number before any of its children do.
 * Leaves are neither numbered nor counted.
 *
 * @param tree  the (sub)tree to relabel in place
 * @param index the number to give to {@code tree} itself
 * @return the next unused index after this subtree has been numbered
 */
static int setIndexLabels(Tree tree, int index) {
  if (tree.isLeaf()) {
    return index;
  }
  tree.label().setValue(String.valueOf(index));
  int nextIndex = index + 1;
  for (Tree subtree : tree.children()) {
    nextIndex = setIndexLabels(subtree, nextIndex);
  }
  return nextIndex;
}
/**
 * Prints one line per non-leaf node: the node's index followed by every
 * element of its node vector, formatted with the class-wide number format.
 * Nodes are numbered exactly as {@code setIndexLabels} numbers them.
 *
 * @param out   stream that receives the lines
 * @param tree  the (sub)tree whose vectors are printed
 * @param index the number of {@code tree} itself
 * @return the next unused index after this subtree
 */
static int outputTreeVectors(PrintStream out, Tree tree, int index) {
  if (tree.isLeaf()) {
    return index;
  }
  // The index prefix is emitted before the vector is fetched, as before.
  out.print(" " + index + ":");
  final SimpleMatrix nodeVector = RNNCoreAnnotations.getNodeVector(tree);
  final int elementCount = nodeVector.getNumElements();
  for (int pos = 0; pos < elementCount; pos++) {
    out.print(" " + NF.format(nodeVector.get(pos)));
  }
  out.println();

  int nextIndex = index + 1;
  for (Tree subtree : tree.children()) {
    nextIndex = outputTreeVectors(out, subtree, nextIndex);
  }
  return nextIndex;
}
/**
 * Prints one line per non-leaf node: the node's index followed by every
 * element of its prediction matrix, formatted with the class-wide number
 * format. Nodes are numbered exactly as {@code setIndexLabels} numbers them.
 *
 * @param out   stream that receives the lines
 * @param tree  the (sub)tree whose scores are printed
 * @param index the number of {@code tree} itself
 * @return the next unused index after this subtree
 */
static int outputTreeScores(PrintStream out, Tree tree, int index) {
  if (tree.isLeaf()) {
    return index;
  }
  // The index prefix is emitted before the predictions are fetched, as before.
  out.print(" " + index + ":");
  final SimpleMatrix scores = RNNCoreAnnotations.getPredictions(tree);
  final int scoreCount = scores.getNumElements();
  for (int pos = 0; pos < scoreCount; pos++) {
    out.print(" " + NF.format(scores.get(pos)));
  }
  out.println();

  int nextIndex = index + 1;
  for (Tree subtree : tree.children()) {
    nextIndex = outputTreeScores(out, subtree, nextIndex);
  }
  return nextIndex;
}
/**
 * Renders a single word using the default separator handling, i.e. the
 * three-argument overload called with a {@code null} separator.
 *
 * @param o         the word object to render
 * @param justValue if {@code true} and {@code o} is a {@code Label}, only
 *                  its value is returned
 * @return the textual form of {@code o}
 */
public static <T> String wordToString(T o, final boolean justValue) {
  final String defaultSeparator = null;
  return wordToString(o, justValue, defaultSeparator);
}
/**
 * Renders a single word object as text.
 * <ul>
 * <li>A {@code CoreLabel} yields its value (or its word when the value is
 *     {@code null}); unless {@code justValue} is set, a non-null tag is
 *     appended after {@code separator}, or after
 *     {@code CoreLabel.TAG_SEPARATOR} when {@code separator} is {@code null}.</li>
 * <li>Any other {@code Label} yields its value when {@code justValue} is set.</li>
 * <li>With a non-null {@code separator}, {@code TaggedWord},
 *     {@code WordLemmaTag} and {@code WordTag} are rendered through their
 *     separator-aware {@code toString}.</li>
 * <li>Everything else falls back to {@code toString()}.</li>
 * </ul>
 *
 * @param o         the word object to render
 * @param justValue whether labels are reduced to their bare value
 * @param separator string placed between word and tag, or {@code null} for
 *                  the defaults
 * @return the textual form of {@code o}
 */
public static <T> String wordToString(T o, final boolean justValue,
    final String separator) {
  if (o instanceof CoreLabel) {
    final CoreLabel core = (CoreLabel) o;
    String text = core.value();
    if (text == null) {
      text = core.word();
    }
    if (justValue) {
      return text;
    }
    final String tag = core.tag();
    if (tag == null) {
      return text;
    }
    if (separator == null) {
      return text + CoreLabel.TAG_SEPARATOR + tag;
    }
    return text + separator + tag;
  }

  if (justValue && o instanceof Label) {
    return ((Label) o).value();
  }

  // an interface that covered these next four cases would be
  // nice, but we're moving away from these data types anyway
  if (separator != null) {
    if (o instanceof TaggedWord) {
      return ((TaggedWord) o).toString(separator);
    }
    if (o instanceof LabeledWord) {
      // NOTE(review): unlike its neighbours this branch ignores the separator;
      // kept as-is, confirm whether that is intentional.
      return ((LabeledWord) o).toString();
    }
    if (o instanceof WordLemmaTag) {
      return ((WordLemmaTag) o).toString(separator);
    }
    if (o instanceof WordTag) {
      return ((WordTag) o).toString(separator);
    }
  }
  return o.toString();
}
/**
 * Joins the words of a tokenized sentence with single spaces, rendering
 * each {@code Label} element by its {@code value()} only and using no
 * custom separator. This is the short form of the sentence.
 *
 * @param list The tokenized sentence to print out
 * @return The tokenized sentence as a String
 */
public static <T> String listToString(List<T> list) {
  return listToString(list, true, null);
}
/**
 * Joins the words of a tokenized sentence with single spaces. Works for
 * elements of any type, not only {@code Label}s, and uses the default
 * separators for word types that have one (such as TaggedWord).
 *
 * @param list The tokenized sentence to print out
 * @param justValue If {@code true}, elements that are {@code Label}s are
 *                  rendered by their {@code value()} only; otherwise each
 *                  element is rendered in its full form
 * @return The sentence in String form
 */
public static <T> String listToString(List<T> list, final boolean justValue) {
  final String defaultSeparator = null;
  return listToString(list, justValue, defaultSeparator);
}
/**
 * Joins the words of a tokenized sentence with single spaces, rendering
 * every element through {@code wordToString(o, justValue, separator)}.
 * An empty list yields the empty string.
 *
 * @param list      The tokenized sentence to print out
 * @param justValue Whether {@code Label} elements are reduced to their value
 * @param separator The string used to separate Word and Tag
 *                  in TaggedWord, etc; {@code null} selects the defaults
 * @return The sentence in String form
 */
public static <T> String listToString(List<T> list, final boolean justValue,
    final String separator) {
  final StringBuilder joined = new StringBuilder();
  boolean first = true;
  for (T word : list) {
    if (!first) {
      joined.append(' ');
    }
    joined.append(wordToString(word, justValue, separator));
    first = false;
  }
  return joined.toString();
}
/**
 * Prints one sentence in each of the requested output styles, in the order
 * the styles appear in {@code outputFormats}.
 *
 * @param out           stream that receives the output
 * @param sentence      sentence carrying the sentiment-annotated tree and
 *                      the sentence-level sentiment class
 * @param outputFormats the styles to print
 * @throws IllegalArgumentException if a style is not handled here
 */
static void outputTree(PrintStream out, CoreMap sentence, List<Output> outputFormats) {
  final Tree sentimentTree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
  for (Output format : outputFormats) {
    switch (format) {
    case ROOT:
      out.println(" " + sentence.get(SentimentCoreAnnotations.SentimentClass.class));
      break;
    case PENNTREES: {
      // Relabel a copy so the annotated tree itself stays untouched.
      final Tree labelled = sentimentTree.deepCopy();
      setSentimentLabels(labelled);
      out.println(labelled);
      break;
    }
    case VECTORS:
    case PROBABILITIES: {
      // Both styles first show an index-labelled copy of the tree, then list
      // per-node numbers read from the original tree.
      final Tree indexed = sentimentTree.deepCopy();
      setIndexLabels(indexed, 0);
      out.println(indexed);
      if (format == Output.VECTORS) {
        outputTreeVectors(out, sentimentTree, 0);
      } else {
        outputTreeScores(out, sentimentTree, 0);
      }
      break;
    }
    default:
      throw new IllegalArgumentException("Unknown output format " + format);
    }
  }
}
/**
 * Reads the given file and turns it into a list of single-sentence
 * annotations.
 * <ul>
 * <li>{@code TEXT}: the whole file is annotated with {@code tokenizer} and
 *     every resulting sentence is wrapped in its own annotation.</li>
 * <li>{@code TREES}: every tree in the file becomes one sentence whose text
 *     is the space-joined yield of the tree. With {@code filterUnknown} the
 *     trees are read with gold labels and passed through
 *     {@code SentimentUtils.filterUnknownRoots}; otherwise they are loaded
 *     through a {@code MemoryTreebank}.</li>
 * </ul>
 *
 * @param tokenizer     pipeline used to tokenize and split raw text; only
 *                      used for {@code TEXT}
 * @param inputFormat   how the file content is to be interpreted
 * @param filename      path of the file to read
 * @param filterUnknown for {@code TREES}, which of the two reading strategies
 *                      described above is used
 * @return one annotation per sentence
 * @throws IllegalArgumentException if the input format is not handled here
 */
public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
  switch (inputFormat) {
  case TEXT: {
    final Annotation document = new Annotation(IOUtils.slurpFileNoExceptions(filename));
    tokenizer.annotate(document);
    final List<Annotation> perSentence = Generics.newArrayList();
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      final Annotation single = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
      single.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
      perSentence.add(single);
    }
    return perSentence;
  }
  case TREES: {
    List<Tree> trees;
    if (!filterUnknown) {
      trees = Generics.newArrayList();
      final MemoryTreebank treebank = new MemoryTreebank("utf-8");
      treebank.loadPath(filename, null);
      for (Tree loaded : treebank) {
        trees.add(loaded);
      }
    } else {
      trees = SentimentUtils.readTreesWithGoldLabels(filename);
      trees = SentimentUtils.filterUnknownRoots(trees);
    }

    final List<Annotation> perTree = Generics.newArrayList();
    for (Tree tree : trees) {
      final CoreMap sentence = new Annotation(listToString(tree.yield()));
      sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
      final List<CoreMap> onlySentence = Collections.singletonList(sentence);
      final Annotation wrapper = new Annotation("");
      wrapper.set(CoreAnnotations.SentencesAnnotation.class, onlySentence);
      perTree.add(wrapper);
    }
    return perTree;
  }
  default:
    throw new IllegalArgumentException("Unknown format " + inputFormat);
  }
}
/**
 * Parser-parameters class handed to the binarizer when {@code -tlppClass}
 * is not given on the command line.
 */
private static final String DEFAULT_TLPP_CLASS =
    "edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams";

/**
 * Runs the tree-based sentiment model on some text, driven by
 * command-line style arguments (flag names are case-insensitive):
 * <ul>
 * <li>{@code -sentimentModel <path>}, {@code -parserModel <path>}: models to load</li>
 * <li>{@code -file <path>}: process one file, writing to standard output</li>
 * <li>{@code -fileList <a,b,...>}: process several files, writing each
 *     result to {@code <file>.out}</li>
 * <li>{@code -stdin}: read one sentence per line from standard input</li>
 * <li>{@code -input <TEXT|TREES>}: input format (default {@code TEXT})</li>
 * <li>{@code -output <fmt,...>}: comma-separated output formats (default {@code ROOT})</li>
 * <li>{@code -filterUnknown}: passed on to {@code getAnnotations}</li>
 * <li>{@code -tlppClass <class>}: parser-parameters class for the binarizer</li>
 * <li>{@code -help}: terminates the JVM with exit status 0</li>
 * </ul>
 * Exactly one of {@code -file}, {@code -fileList} and {@code -stdin} must be given.
 *
 * @param args the arguments described above
 * @throws IOException if reading standard input or writing an output file fails
 * @throws IllegalArgumentException if an argument is unknown, a flag lacks
 *         its value, the number of input sources is not exactly one, or
 *         {@code -stdin} is combined with tree input
 */
public void processTextWithArgs(String[] args) throws IOException {
  String parserModel = null;
  String sentimentModel = null;
  String filename = null;
  String fileList = null;
  boolean stdin = false;
  boolean filterUnknown = false;
  List<Output> outputFormats = Collections.singletonList(Output.ROOT);
  Input inputFormat = Input.TEXT;
  String tlppClass = DEFAULT_TLPP_CLASS;

  for (int argIndex = 0; argIndex < args.length; ) {
    final String flag = args[argIndex];
    if (flag.equalsIgnoreCase("-sentimentModel")) {
      sentimentModel = optionValue(args, argIndex);
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-parserModel")) {
      parserModel = optionValue(args, argIndex);
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-file")) {
      filename = optionValue(args, argIndex);
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-fileList")) {
      fileList = optionValue(args, argIndex);
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-stdin")) {
      stdin = true;
      argIndex++;
    } else if (flag.equalsIgnoreCase("-input")) {
      inputFormat = Input.valueOf(optionValue(args, argIndex).toUpperCase());
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-output")) {
      String[] formats = optionValue(args, argIndex).split(",");
      outputFormats = new ArrayList<>();
      for (String format : formats) {
        outputFormats.add(Output.valueOf(format.toUpperCase()));
      }
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-filterUnknown")) {
      filterUnknown = true;
      argIndex++;
    } else if (flag.equalsIgnoreCase("-tlppClass")) {
      tlppClass = optionValue(args, argIndex);
      argIndex += 2;
    } else if (flag.equalsIgnoreCase("-help")) {
      System.exit(0);
    } else {
      // Report the offending token itself, not the one after it.
      log.info("Unknown argument " + flag);
      throw new IllegalArgumentException("Unknown argument " + flag);
    }
  }

  // We construct two pipelines. One handles tokenization, if
  // necessary. The other takes tokenized sentences and converts
  // them to sentiment trees.
  Properties pipelineProps = new Properties();
  Properties tokenizerProps = null;
  if (sentimentModel != null) {
    pipelineProps.setProperty("sentiment.model", sentimentModel);
  }
  if (parserModel != null) {
    pipelineProps.setProperty("parse.model", parserModel);
  }
  if (inputFormat == Input.TREES) {
    pipelineProps.setProperty("annotators", "binarizer, sentiment");
    pipelineProps.setProperty("customAnnotatorClass.binarizer", "edu.stanford.nlp.pipeline.BinarizerAnnotator");
    pipelineProps.setProperty("binarizer.tlppClass", tlppClass);
    pipelineProps.setProperty("enforceRequirements", "false");
  } else {
    pipelineProps.setProperty("annotators", "parse, sentiment");
    pipelineProps.setProperty("enforceRequirements", "false");
    tokenizerProps = new Properties();
    tokenizerProps.setProperty("annotators", "tokenize, ssplit");
  }
  if (stdin && tokenizerProps != null) {
    tokenizerProps.setProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "true");
  }

  int count = 0;
  if (filename != null) count++;
  if (fileList != null) count++;
  if (stdin) count++;
  if (count > 1) {
    throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin");
  }
  if (count == 0) {
    throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin");
  }
  if (stdin && tokenizerProps == null) {
    // Lines from stdin are run through the tokenizer, which does not exist
    // for tree input; fail before any model is loaded.
    throw new IllegalArgumentException("-stdin can only be used with -input TEXT");
  }

  StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
  StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps);

  if (filename != null) {
    // Process a file. The pipeline will do tokenization, which
    // means it will split it into sentences as best as possible
    // with the tokenizer.
    List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, filename, filterUnknown);
    for (Annotation annotation : annotations) {
      pipeline.annotate(annotation);
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        System.out.println(sentence);
        outputTree(System.out, sentence, outputFormats);
      }
    }
  } else if (fileList != null) {
    // Process multiple files. The pipeline will do tokenization,
    // which means it will split it into sentences as best as
    // possible with the tokenizer. Output will go to filename.out
    // for each file.
    for (String file : fileList.split(",")) {
      List<Annotation> annotations = getAnnotations(tokenizer, inputFormat, file, filterUnknown);
      // try-with-resources flushes and closes the output file even when
      // annotating or printing a sentence throws.
      try (PrintStream pout = new PrintStream(new FileOutputStream(file + ".out"))) {
        for (Annotation annotation : annotations) {
          pipeline.annotate(annotation);
          for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            pout.println(sentence);
            outputTree(pout, sentence, outputFormats);
          }
        }
      }
    }
  } else {
    // Process stdin. Each line will be treated as a single sentence.
    log.info("Reading in text from stdin.");
    log.info("Please enter one sentence per line.");
    log.info("Processing will end when EOF is reached.");
    BufferedReader reader = IOUtils.readerFromStdin("utf-8");
    for (String line; (line = reader.readLine()) != null; ) {
      line = line.trim();
      if ( ! line.isEmpty()) {
        Annotation annotation = tokenizer.process(line);
        pipeline.annotate(annotation);
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
          outputTree(System.out, sentence, outputFormats);
        }
      } else {
        // Output blank lines for blank lines so the tool can be
        // used for line-by-line text processing
        System.out.println();
      }
    }
  }
}

/**
 * Returns the value that follows the flag at {@code flagIndex}.
 *
 * @throws IllegalArgumentException if the flag is the last argument
 */
private static String optionValue(String[] args, int flagIndex) {
  if (flagIndex + 1 >= args.length) {
    throw new IllegalArgumentException("Missing value for argument " + args[flagIndex]);
  }
  return args[flagIndex + 1];
}
/**
 * Pipeline shared by all {@code getNumericSentimentValue} calls; created on
 * first use because constructing it loads the parser and sentiment models.
 */
private static StanfordCoreNLP sharedSentimentPipeline;

/**
 * Returns the shared tokenize/ssplit/parse/sentiment pipeline, building it
 * on the first call. A failed construction is not cached, so a later call
 * tries again and reports the same error.
 */
private static synchronized StanfordCoreNLP sentimentPipeline() {
  if (sharedSentimentPipeline == null) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
    sharedSentimentPipeline = new StanfordCoreNLP(props);
  }
  return sharedSentimentPipeline;
}

/**
 * Computes a numeric sentiment for a piece of text: the predicted sentiment
 * class of its longest sentence, where length is the length of the
 * sentence's string form. When several sentences share the maximum length
 * the first one wins.
 *
 * @param expression the text to analyse; may be {@code null}
 * @return the predicted class of the longest sentence, or {@code 0} when
 *         {@code expression} is {@code null}, empty, or yields no sentence
 */
public float getNumericSentimentValue(String expression) {
  if (expression == null || expression.isEmpty()) {
    // Nothing to analyse; avoid loading the models at all.
    return 0;
  }
  int mainSentiment = 0;
  int longest = 0;
  Annotation annotation = sentimentPipeline().process(expression);
  for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    Tree tree = sentence.get(SentimentAnnotatedTree.class);
    int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
    int length = sentence.toString().length();
    if (length > longest) {
      mainSentiment = sentiment;
      longest = length;
    }
  }
  return mainSentiment;
}
}