opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/opinion_processor/NamedEntityExtractor.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.tools.parse_thicket.opinion_processor;

 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;

 import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
 import opennlp.tools.jsmlearning.ProfileReaderWriter;
 import opennlp.tools.parse_thicket.ParseThicket;
 import opennlp.tools.parse_thicket.ParseTreeNode;
 import opennlp.tools.parse_thicket.VerbNetProcessor;
 import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
 import opennlp.tools.parse_thicket.matching.Matcher;
 import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
 import opennlp.tools.similarity.apps.utils.Pair;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.TextProcessor;

 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.StringUtils;

 /**
  * Extracts multi-token named entities and short sentiment-bearing phrases from a paragraph.
  * <p>
  * The paragraph is converted to a {@link ParseThicket} by the shared {@link Matcher}. Runs of
  * consecutive tokens tagged ORGANIZATION, LOCATION or PERSON are collected as named entities,
  * and parser phrases containing a sentiment word (from the CSV list loaded by the constructor
  * or from {@link SentimentVocab}) are collected as opinionated phrases.
  */
 public class NamedEntityExtractor {
 	/** Builds the parse thicket for every paragraph; shared by all instances. */
 	protected static Matcher matcher;
 	// NOTE(review): neither of these two constants is referenced inside this class.
 	private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
 	protected ArrayList<File> queue = new ArrayList<File>();
 	protected static PT2ThicketPhraseBuilder phraseBuilder;
 	protected static SentimentVocab sVocab = SentimentVocab.getInstance();
 	/** Path of the sentiment word list; stays null when the working directory cannot be resolved. */
 	String resourceDirSentimentList = null;
 	/** First column of every row of the sentiment word list. */
 	Set<String> sentimentVcb = new HashSet<String> ();

 	// The JVM runs a static initializer at most once, under its own class-initialization
 	// lock, so an explicit synchronized block around these assignments adds nothing.
 	static {
 		matcher = new Matcher();
 		phraseBuilder = new PT2ThicketPhraseBuilder();
 	}

 	/**
 	 * Loads the sentiment vocabulary from
 	 * {@code <working dir>/src/test/resources/opinions/sentiment_listReduced.csv}.
 	 * <p>
 	 * When the path cannot be resolved, or the reader hands back no rows, the local vocabulary
 	 * is left empty instead of failing with a NullPointerException; {@link #sVocab} is still
 	 * consulted by {@link #extractEntities(String)}.
 	 */
 	public NamedEntityExtractor(){
 		try {
 			resourceDirSentimentList = new File( "." ).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
 		} catch (IOException e) {
 			e.printStackTrace();
 		}
 		if (resourceDirSentimentList == null) {
 			return;
 		}
 		// NOTE(review): readProfiles presumably returns null when the file is unreadable - confirm.
 		List<String[]> sentimentList = ProfileReaderWriter.readProfiles(resourceDirSentimentList);
 		if (sentimentList == null) {
 			return;
 		}
 		for (String[] line : sentimentList) {
 			// skip rows that have no first column at all
 			if (line != null && line.length > 0) {
 				sentimentVcb.add(line[0]);
 			}
 		}
 	}

 	/**
 	 * Tells whether the word is in the vocabulary loaded by the constructor.
 	 *
 	 * @param word token text to look up
 	 * @return true when {@code word} is an entry of {@link #sentimentVcb}
 	 */
 	protected boolean isSentimentWord(String word){
 		return sentimentVcb.contains(word);
 	}

 	/**
 	 * Parses the paragraph and collects named entities and sentiment phrases.
 	 *
 	 * @param para paragraph text; {@code null} produces a result holding three empty lists
 	 * @return result carrying the entity node lists, the matching entity strings (same order),
 	 *         and the reduced list of sentiment phrases
 	 */
 	public EntityExtractionResult extractEntities(String para){
 		List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
 		List<String> extractedNERsWords = new ArrayList<String>();
 		List<List<ParseTreeNode>> extractedSentimentPhrases =
 				new ArrayList<List<ParseTreeNode>>();
 		EntityExtractionResult result = new EntityExtractionResult();

 		if (para != null) {
 			System.out.println("Processing paragraph of length "+para.length() + " | "+ para);
 			ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para);
 			List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();

 			for (List<ParseTreeNode> sentence : nodeList) {
 				collectNamedEntities(sentence, extractedNERs, extractedNERsWords);
 				// NOTE(review): getPhrases() is asked of the whole thicket once per sentence, so
 				// the same phrases are presumably scanned repeatedly; the repeats are dropped
 				// again by reduceExtractedPhrases. Kept as is - confirm before hoisting.
 				collectSentimentPhrases(pt.getPhrases(), extractedSentimentPhrases);
 			}

 			extractedSentimentPhrases = reduceExtractedPhrases(extractedSentimentPhrases);
 		}

 		result.setExtractedNER(extractedNERs);
 		result.setExtractedNERWords(extractedNERsWords);
 		result.setExtractedSentimentPhrases(extractedSentimentPhrases);
 		return result;
 	}

 	/**
 	 * Scans one sentence for runs of consecutive entity tokens and records every run whose
 	 * text contains a space (i.e. at least two tokens). The node list and the text of a run
 	 * are always added together, so both output lists stay index-aligned.
 	 */
 	private void collectNamedEntities(List<ParseTreeNode> sentence,
 			List<List<ParseTreeNode>> extractedNERs, List<String> extractedNERsWords) {
 		boolean bInsideNER = false;
 		String currentPhrase = "";
 		List<ParseTreeNode> currentPhraseNode = new ArrayList<ParseTreeNode>();
 		for (ParseTreeNode word : sentence) {
 			if (isNERforPhraseExtraction(word)) {
 				if (bInsideNER) {
 					currentPhrase += " "+word.getWord();
 				} else {
 					bInsideNER = true;
 					currentPhrase = word.getWord();
 					// every run gets its own list, so a stored run is never extended later
 					currentPhraseNode = new ArrayList<ParseTreeNode>();
 				}
 				currentPhraseNode.add(word);
 			} else if (bInsideNER) {
 				if (currentPhrase.indexOf(' ')>-1) { // at least two tokens
 					extractedNERsWords.add(currentPhrase);
 					extractedNERs.add(currentPhraseNode);
 				}
 				currentPhrase = "";
 				bInsideNER = false;
 			}
 		}
 		// flush a run that reaches the end of the sentence
 		if (currentPhrase.length()>1 && currentPhrase.indexOf(' ')>-1) {
 			extractedNERs.add(currentPhraseNode);
 			extractedNERsWords.add(currentPhrase);
 		}
 	}

 	/**
 	 * Walks every phrase from its last token backwards and, at the first sentiment word,
 	 * records the phrase when it has between 2 and 6 tokens.
 	 */
 	private void collectSentimentPhrases(List<List<ParseTreeNode>> phrases,
 			List<List<ParseTreeNode>> extractedSentimentPhrases) {
 		Set<String> foundSentimentWords = new HashSet<String>();
 		for (List<ParseTreeNode> phrase : phrases) {
 			// find a noun phrase under sentiment
 			try {
 				for (int i = phrase.size()-1; i>-1; i--) {
 					String token = phrase.get(i).getWord();
 					// Grouping spelled out exactly as the compiler already read it (&& binds
 					// tighter than ||): the "not seen yet" test guards only the sVocab lookup.
 					// NOTE(review): possibly meant to guard both lookups - confirm intent.
 					boolean isSentiment = isSentimentWord(token)
 							|| (sVocab.isSentimentWord(token) && !foundSentimentWords.contains(token));
 					if (isSentiment) {
 						foundSentimentWords.add(token);
 						System.out.println("Sentim = " + token + " | Found opinionated phrase "+phrase.toString());
 						if (phrase.size()>1 && phrase.size()<7) {
 							extractedSentimentPhrases.add(phrase);
 						}
 						break;
 					}
 				}
 			} catch (Exception e) {
 				// best effort: a phrase that cannot be inspected is reported and skipped
 				e.printStackTrace();
 			}
 		}
 	}

 	/**
 	 * Drops every phrase whose word string occurs inside (or equals) the word string of a
 	 * phrase located earlier in the list; the order of the survivors is preserved.
 	 *
 	 * @param extractedSentimentPhrases candidate phrases, possibly with repeats
 	 * @return new list holding the phrases that were not dropped
 	 */
 	private List<List<ParseTreeNode>> reduceExtractedPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
 		int total = extractedSentimentPhrases.size();
 		// render every phrase once instead of once per comparison
 		List<String> phraseStrings = new ArrayList<String>(total);
 		for (List<ParseTreeNode> phrase : extractedSentimentPhrases) {
 			phraseStrings.add(ParseTreeNode.toWordString(phrase));
 		}
 		boolean[] toDelete = new boolean[total];
 		for (int i = 0; i<total; i++) {
 			String earlier = phraseStrings.get(i);
 			for (int j = i+1; j<total; j++) {
 				if (earlier.indexOf(phraseStrings.get(j))>-1) {
 					toDelete[j] = true;
 				}
 			}
 		}
 		List<List<ParseTreeNode>> resultPhrases = new ArrayList<List<ParseTreeNode>>();
 		for (int i = 0; i<total; i++) {
 			if (!toDelete[i]) {
 				resultPhrases.add(extractedSentimentPhrases.get(i));
 			}
 		}
 		return resultPhrases;
 	}

 	/**
 	 * Tells whether a token may be part of a named-entity run: its entity tag must be
 	 * ORGANIZATION, LOCATION or PERSON and its POS tag must start with NN, PR, IN, JJ or DT.
 	 * A token with a missing entity or POS tag is rejected rather than causing an exception.
 	 */
 	private boolean isNERforPhraseExtraction(ParseTreeNode word){
 		String ne = word.getNe();
 		if (ne == null) {
 			return false;
 		}
 		boolean isEntity = ne.equals("ORGANIZATION") || ne.equals("LOCATION") || ne.equals("PERSON");
 		if (!isEntity) {
 			return false;
 		}
 		String pos = word.getPos();
 		if (pos == null) {
 			return false;
 		}
 		return pos.startsWith("NN") || pos.startsWith("PR") || pos.startsWith("IN")
 				|| pos.startsWith("JJ") || pos.startsWith("DT");
 	}


 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.tools.parse_thicket.opinion_processor;

	import java.io.File;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;

	import opennlp.tools.apps.relevanceVocabs.SentimentVocab;
	import opennlp.tools.jsmlearning.ProfileReaderWriter;
	import opennlp.tools.parse_thicket.ParseThicket;
	import opennlp.tools.parse_thicket.ParseTreeNode;
	import opennlp.tools.parse_thicket.VerbNetProcessor;
	import opennlp.tools.parse_thicket.kernel_interface.DescriptiveParagraphFromDocExtractor;
	import opennlp.tools.parse_thicket.matching.Matcher;
	import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
	import opennlp.tools.similarity.apps.utils.Pair;
	import opennlp.tools.textsimilarity.ParseTreeChunk;
	import opennlp.tools.textsimilarity.TextProcessor;

	import org.apache.commons.io.FileUtils;
	import org.apache.commons.lang.StringUtils;

	public class NamedEntityExtractor {
	protected static Matcher matcher;
	private static int PARA_LENGTH_IN_SENTENCES = 5, PARA_LENGTH = 250;
	protected ArrayList<File> queue = new ArrayList<File>();
	protected static PT2ThicketPhraseBuilder phraseBuilder;
	protected static SentimentVocab sVocab = SentimentVocab.getInstance();
	String resourceDirSentimentList = null;
	Set<String> sentimentVcb = new HashSet<String> ();

	static {
	synchronized (NamedEntityExtractor.class) {
	matcher = new Matcher();
	phraseBuilder = new PT2ThicketPhraseBuilder();
	}
	}

	public NamedEntityExtractor(){
	try {
	resourceDirSentimentList = new File( "." ).getCanonicalPath()+"/src/test/resources/opinions/sentiment_listReduced.csv";
	} catch (IOException e) {
	e.printStackTrace();
	}
	List<String[]> sentimentList=null;
	sentimentList = ProfileReaderWriter.readProfiles(resourceDirSentimentList);
	for(String[] line: sentimentList){
	sentimentVcb.add(line[0]);
	}
	}

	protected boolean isSentimentWord(String word){
	if (sentimentVcb.contains(word))
	return true;
	else
	return false;
	}

	public EntityExtractionResult extractEntities(String para){
	List<List<ParseTreeNode>> extractedNERs = new ArrayList<List<ParseTreeNode>>();
	List<String> extractedNERsWords = new ArrayList<String>();
	List<List<ParseTreeNode>> extractedSentimentPhrases =
	new ArrayList<List<ParseTreeNode>>();
	EntityExtractionResult result = new EntityExtractionResult();

	ParseThicket pt = null;

	System.out.println("Processing paragraph of length "+para.length() + " \| "+ para);
	pt = matcher.buildParseThicketFromTextWithRST(para);
	List<List<ParseTreeNode>> nodeList = pt.getSentenceNodes();


	for(List<ParseTreeNode> sentence: nodeList){
	//System.out.println(" Processing sentence: "+ sentence);
	boolean bInsideNER = false;
	String currentPhrase = "";
	List<ParseTreeNode> currentPhraseNode = new ArrayList<ParseTreeNode>();
	for(ParseTreeNode word: sentence){
	if (isNERforPhraseExtraction(word)){
	//System.out.println("++Found word ="+word + " \| NER="+ word.getNe());
	if (bInsideNER){
	currentPhrase += " "+word.getWord();
	currentPhraseNode.add(word);
	} else {
	bInsideNER=true;
	currentPhrase = word.getWord();
	currentPhraseNode.add(word);
	}
	} else {
	if (bInsideNER){
	if (currentPhrase.indexOf(' ')>-1) // at least two tokens
	extractedNERsWords.add(currentPhrase);
	extractedNERs.add(currentPhraseNode);
	currentPhrase = "";
	bInsideNER=false;
	} else {
	// do nothing, continue scan
	}
	}
	}
	if (currentPhrase.length()>1 && currentPhrase.indexOf(' ')>-1){
	extractedNERs.add(currentPhraseNode);
	extractedNERsWords.add(currentPhrase);
	}

	Set<String> foundSentimentWords = new HashSet<String>();
	// now we extract phrases
	List<List<ParseTreeNode>> phrases = pt.getPhrases();
	for(List<ParseTreeNode> phrase: phrases){
	// find a noun phrase under sentiment
	try {
	for(int i = phrase.size()-1; i>-1; i--){
	ParseTreeNode word = phrase.get(i);
	if ((isSentimentWord(word.getWord()) \|\|
	sVocab.isSentimentWord(word.getWord()) && !foundSentimentWords.contains(word.getWord()) )){
	foundSentimentWords.add(word.getWord());
	System.out.println("Sentim = " + word.getWord() + " \| Found opinionated phrase "+phrase.toString());
	if (phrase.size()>1 && phrase.size()<7)
	extractedSentimentPhrases.add(phrase);
	break;
	}
	}
	} catch (Exception e) {
	e.printStackTrace();
	}
	}

	}

	extractedSentimentPhrases = reduceExtractedPhrases(extractedSentimentPhrases);

	result.setExtractedNER(extractedNERs);
	result.setExtractedNERWords(extractedNERsWords);
	result.setExtractedSentimentPhrases(extractedSentimentPhrases);
	return result;
	}

	private List<List<ParseTreeNode>> reduceExtractedPhrases(List<List<ParseTreeNode>> extractedSentimentPhrases) {
	List<Integer> idsToDelete = new ArrayList<Integer>();
	for(int i = 0; i<extractedSentimentPhrases.size(); i++){
	for(int j = i+1; j<extractedSentimentPhrases.size(); j++){
	String phrStr1 = ParseTreeNode.toWordString(extractedSentimentPhrases.get(i));
	String phrStr2 = ParseTreeNode.toWordString(extractedSentimentPhrases.get(j));
	if (phrStr1 .indexOf(phrStr2 )>-1)
	idsToDelete.add(j);
	}
	}
	List<List<ParseTreeNode>> resultPhrases = new ArrayList<List<ParseTreeNode>>();
	for(int i = 0; i<extractedSentimentPhrases.size(); i++){
	if (!idsToDelete.contains(i))
	resultPhrases .add(extractedSentimentPhrases.get(i));
	}
	return resultPhrases ;
	}

	private boolean isNERforPhraseExtraction(ParseTreeNode word){
	if ((word.getNe().equals("ORGANIZATION") \|\|word.getNe().equals("LOCATION") \|\| word.getNe().equals("PERSON") ) &&
	(word.getPos().startsWith("NN") \|\| word.getPos().startsWith("PR") \|\| word.getPos().startsWith("IN")\|\|
	word.getPos().startsWith("JJ") \|\| word.getPos().startsWith("DT") ))
	return true;

	return false;

	}


	}