// blob: b766c7c9c73646aaf729124ea215e9d1f57f1425 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.opinion_processor;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
import opennlp.tools.jsmlearning.ProfileReaderWriter;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.parse_thicket.VerbNetProcessor;
import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
import opennlp.tools.parse_thicket.matching.Matcher;
import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
import opennlp.tools.similarity.apps.utils.Pair;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.TextProcessor;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
/**
 * Extracts multi-token named-entity phrases and short sentiment-bearing phrases from a
 * paragraph of text.
 *
 * <p>The paragraph is turned into a {@link ParseThicket} by the shared {@link Matcher};
 * named entities are collected from the thicket's sentence nodes and sentiment phrases from
 * the thicket's phrase list. A word counts as a sentiment word when it is either in the
 * vocabulary loaded by the constructor or recognised by the shared {@link SentimentVocab}.
 */
public class NamedEntityExtractor {
  protected static Matcher matcher;
  // Currently not referenced anywhere in this class.
  private static final int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
  protected ArrayList<File> queue = new ArrayList<File>();
  protected static PT2ThicketPhraseBuilder phraseBuilder;
  protected static SentimentVocab sVocab = SentimentVocab.getInstance();
  /** Path of the sentiment word list used by this instance; null if it could not be resolved. */
  String resourceDirSentimentList = null;
  /** First column of every row of the sentiment word list. */
  Set<String> sentimentVcb = new HashSet<String>();

  static {
    // Class initialisation is already serialised by the JVM, so no explicit lock is needed.
    matcher = new Matcher();
    phraseBuilder = new PT2ThicketPhraseBuilder();
  }

  /**
   * Creates an extractor that loads the sentiment vocabulary from
   * {@code src/test/resources/opinions/sentiment_listReduced.csv} under the current working
   * directory. If the working directory cannot be resolved the vocabulary stays empty.
   */
  public NamedEntityExtractor() {
    this(defaultSentimentListPath());
  }

  /**
   * Creates an extractor that loads the sentiment vocabulary from the given file.
   *
   * @param sentimentListPath path of the sentiment list; the first column of each row is
   *        used as a sentiment word. When null, nothing is loaded and the vocabulary stays
   *        empty.
   */
  public NamedEntityExtractor(String sentimentListPath) {
    resourceDirSentimentList = sentimentListPath;
    if (sentimentListPath == null) {
      return;
    }
    List<String[]> sentimentList = ProfileReaderWriter.readProfiles(sentimentListPath);
    if (sentimentList == null) {
      return;
    }
    for (String[] line : sentimentList) {
      // Skip blank rows instead of failing on line[0].
      if (line != null && line.length > 0) {
        sentimentVcb.add(line[0]);
      }
    }
  }

  /** Resolves the default sentiment list location; returns null when that fails. */
  private static String defaultSentimentListPath() {
    try {
      return new File( "." ).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }

  /**
   * Tells whether the word is present in the vocabulary loaded by the constructor.
   *
   * @param word the word to look up
   * @return true if the loaded vocabulary contains the word
   */
  protected boolean isSentimentWord(String word) {
    return sentimentVcb.contains(word);
  }

  /**
   * Extracts named-entity phrases and sentiment phrases from a paragraph.
   *
   * <p>Only entity phrases whose text contains a space (at least two tokens) are reported,
   * and the node list and word list of the result are kept index-aligned. Sentiment phrases
   * are limited to 2..6 nodes and are passed through {@link #reduceExtractedPhrases(List)}
   * before being returned.
   *
   * @param para the paragraph text; a null paragraph yields a result with empty lists
   * @return the extraction result holding entity nodes, entity words and sentiment phrases
   */
  public EntityExtractionResult extractEntities(String para) {
    List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
    List<String> extractedNERsWords = new ArrayList<String>();
    List<List<ParseTreeNode>> extractedSentimentPhrases =
        new ArrayList<List<ParseTreeNode>>();
    EntityExtractionResult result = new EntityExtractionResult();

    if (para != null) {
      System.out.println("Processing paragraph of length "+para.length() + " | "+ para);
      ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para);
      List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();

      for (List<ParseTreeNode> sentence : nodeList) {
        collectNamedEntities(sentence, extractedNERs, extractedNERsWords);
      }

      // The phrase list belongs to the whole thicket, so it is scanned once rather than
      // once per sentence. The guard keeps the earlier behaviour of extracting nothing
      // when the thicket has no sentences.
      if (!nodeList.isEmpty()) {
        collectSentimentPhrases(pt.getPhrases(), extractedSentimentPhrases);
      }
      extractedSentimentPhrases = reduceExtractedPhrases(extractedSentimentPhrases);
    }

    result.setExtractedNER(extractedNERs);
    result.setExtractedNERWords(extractedNERsWords);
    result.setExtractedSentimentPhrases(extractedSentimentPhrases);
    return result;
  }

  /** Collects every maximal run of consecutive entity tokens in one sentence. */
  private void collectNamedEntities(List<ParseTreeNode> sentence,
      List<List<ParseTreeNode>> nerNodes, List<String> nerWords) {
    List<ParseTreeNode> currentRun = new ArrayList<ParseTreeNode>();
    for (ParseTreeNode word : sentence) {
      if (isNERforPhraseExtraction(word)) {
        currentRun.add(word);
      } else if (!currentRun.isEmpty()) {
        addIfMultiToken(currentRun, nerNodes, nerWords);
        // Start a fresh list so stored phrases never share (and later grow) one instance.
        currentRun = new ArrayList<ParseTreeNode>();
      }
    }
    // A run may extend to the end of the sentence.
    addIfMultiToken(currentRun, nerNodes, nerWords);
  }

  /** Records the run in both output lists, but only when its text spans at least two tokens. */
  private static void addIfMultiToken(List<ParseTreeNode> run,
      List<List<ParseTreeNode>> nerNodes, List<String> nerWords) {
    if (run.isEmpty()) {
      return;
    }
    StringBuilder phrase = new StringBuilder();
    for (int i = 0; i < run.size(); i++) {
      if (i > 0) {
        phrase.append(' ');
      }
      phrase.append(run.get(i).getWord());
    }
    if (phrase.indexOf(" ") > -1) { // at least two tokens
      nerNodes.add(run);
      nerWords.add(phrase.toString());
    }
  }

  /**
   * Scans each phrase from its last word backwards and, at the first sentiment word found,
   * records the phrase if it has between 2 and 6 nodes.
   */
  private void collectSentimentPhrases(List<List<ParseTreeNode>> phrases,
      List<List<ParseTreeNode>> sentimentPhrases) {
    Set<String> foundSentimentWords = new HashSet<String>();
    for (List<ParseTreeNode> phrase : phrases) {
      // Best effort: a failure on one phrase must not abort the remaining phrases.
      try {
        for (int i = phrase.size() - 1; i > -1; i--) {
          ParseTreeNode word = phrase.get(i);
          // NOTE(review): the already-found check only applies to sVocab hits, exactly as
          // the original unparenthesised `a || b && !c` evaluated. Confirm whether it was
          // meant to cover words from the loaded vocabulary as well.
          if (isSentimentWord(word.getWord())
              || (sVocab.isSentimentWord(word.getWord())
                  && !foundSentimentWords.contains(word.getWord()))) {
            foundSentimentWords.add(word.getWord());
            System.out.println("Sentim = " + word.getWord() + " | Found opinionated phrase "+phrase.toString());
            if (phrase.size() > 1 && phrase.size() < 7) {
              sentimentPhrases.add(phrase);
            }
            break;
          }
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Drops every phrase whose word string occurs inside the word string of a phrase that
   * precedes it in the list (exact duplicates included).
   *
   * @param extractedSentimentPhrases the phrases to filter; not modified
   * @return a new list with the surviving phrases in their original order
   */
  private List<List<ParseTreeNode>> reduceExtractedPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
    int count = extractedSentimentPhrases.size();

    // Build each phrase string once instead of once per comparison.
    List<String> phraseStrings = new ArrayList<String>(count);
    for (List<ParseTreeNode> phrase : extractedSentimentPhrases) {
      phraseStrings.add(ParseTreeNode.toWordString(phrase));
    }

    // NOTE(review): the check is one-directional - an earlier phrase contained in a later
    // one is kept. Confirm that this asymmetry is intended.
    boolean[] redundant = new boolean[count];
    for (int i = 0; i < count; i++) {
      for (int j = i + 1; j < count; j++) {
        if (phraseStrings.get(i).indexOf(phraseStrings.get(j)) > -1) {
          redundant[j] = true;
        }
      }
    }

    List<List<ParseTreeNode>> resultPhrases = new ArrayList<List<ParseTreeNode>>();
    for (int i = 0; i < count; i++) {
      if (!redundant[i]) {
        resultPhrases.add(extractedSentimentPhrases.get(i));
      }
    }
    return resultPhrases;
  }

  /**
   * Tells whether a token may be part of an extracted entity phrase: its NE value must be
   * ORGANIZATION, LOCATION or PERSON and its POS value must start with NN, PR, IN, JJ or DT.
   * A token with a null NE or POS value is never accepted.
   */
  private boolean isNERforPhraseExtraction(ParseTreeNode word) {
    String ne = word.getNe();
    boolean entityType = "ORGANIZATION".equals(ne) || "LOCATION".equals(ne) || "PERSON".equals(ne);
    if (!entityType) {
      return false;
    }
    String pos = word.getPos();
    if (pos == null) {
      return false;
    }
    return pos.startsWith("NN") || pos.startsWith("PR") || pos.startsWith("IN")
        || pos.startsWith("JJ") || pos.startsWith("DT");
  }
}