opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.similarity.apps;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;

 import opennlp.tools.similarity.apps.utils.StringCleaner;
 import opennlp.tools.stemmer.PorterStemmer;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.TextProcessor;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

 public class StoryDiscourseNavigator {
 	protected BingQueryRunner yrunner = new BingQueryRunner();
 	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
 			.getInstance();
 	private PorterStemmer ps = new PorterStemmer();

 	public static final String[] frequentPerformingVerbs = {
 		" born raised meet learn ", " graduated enter discover",
 		" facts inventions life ", "accomplishments childhood timeline",
 		" acquire befriend encounter", " achieve reache describe ",
 		" invent innovate improve ", " impress outstanding award",
 		" curous sceptical pessimistic", " spend enroll assume point",
 		" explain discuss dispute", " learn teach study investigate",
 		" propose suggest indicate", " pioneer explorer discoverer ",
 		" advance promote lead", " direct control simulate ",
 		" guide lead assist ", " inspire first initial",
 		" vision predict foresee", " prediction inspiration achievement",
 		" approve agree confirm", " deny argue disagree",
 		" emotional loud imagination", " release announce celebrate discover",
 		"introduce enjoy follow", " open present show",
 		"meet enjoy follow create", "discover continue produce"

 	};

 	public String[] obtainAdditionalKeywordsForAnEntity(String entity){
 		List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
 				entity, "", "en", 30);
 		Collection<String> keywordsToRemove = TextProcessor.fastTokenize(entity.toLowerCase(), false);
 		List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(matchList);
 		String[] res = new String[resList.size()];
 		int i=0;
 		for(List<String> phrase: resList){
 			phrase.removeAll(keywordsToRemove);
 			String keywords = phrase.toString().replace('[', ' ').replace(']', ' ').replace(',',' ');
 			res[i] = keywords;
 			i++;
 		}
 		return res;
 	}

 	public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
 			String domain, String lang, int numbOfHits) {
 		List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
 		try {
 			List<HitBase> resultList = yrunner.runSearch(query, numbOfHits);

 			for (int i = 0; i < resultList.size(); i++) {
 				{
 					for (int j = i + 1; j < resultList.size(); j++) {
 						HitBase h1 = resultList.get(i);
 						HitBase h2 = resultList.get(j);
 						String snapshot1 = StringCleaner.processSnapshotForMatching(h1
 								.getTitle() + " . " + h1.getAbstractText());
 						String snapshot2 = StringCleaner.processSnapshotForMatching(h2
 								.getTitle() + " . " + h2.getAbstractText());
 						SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,
 								snapshot2);
 						List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
 						genResult.addAll(matchResult);
 					}
 				}
 			}

 		} catch (Exception e) {
 			System.err.print("Problem extracting taxonomy node");
 		}

 		return genResult;
 	}
 	private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
 			List<List<ParseTreeChunk>> matchList) {
 		List<List<String>> res = new ArrayList<List<String>>();
 		for (List<ParseTreeChunk> chunks : matchList) {
 			List<String> wordRes = new ArrayList<String>();
 			for (ParseTreeChunk ch : chunks) {
 				List<String> lemmas = ch.getLemmas();
 				for (int w = 0; w < lemmas.size(); w++)
 					if ((!lemmas.get(w).equals("*"))
 							&& ((ch.getPOSs().get(w).startsWith("NN") || ch.getPOSs().get(w)
 									.startsWith("VB"))) && lemmas.get(w).length() > 2) {
 						String formedWord = lemmas.get(w);
 						String stemmedFormedWord = ps.stem(formedWord);
 						if (!stemmedFormedWord.startsWith("invalid"))
 							wordRes.add(formedWord);
 					}
 			}
 			wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
 			if (wordRes.size() > 0) {
 				res.add(wordRes);
 			}
 		}
 		res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
 		return res;
 	}
 	public static void main(String[] args){
 		String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein");
 		System.out.println(Arrays.asList(res));
 	}
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.similarity.apps;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collection;
	import java.util.HashSet;
	import java.util.List;

	import opennlp.tools.similarity.apps.utils.StringCleaner;
	import opennlp.tools.stemmer.PorterStemmer;
	import opennlp.tools.textsimilarity.ParseTreeChunk;
	import opennlp.tools.textsimilarity.SentencePairMatchResult;
	import opennlp.tools.textsimilarity.TextProcessor;
	import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

	public class StoryDiscourseNavigator {
	protected BingQueryRunner yrunner = new BingQueryRunner();
	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
	.getInstance();
	private PorterStemmer ps = new PorterStemmer();

	public static final String[] frequentPerformingVerbs = {
	" born raised meet learn ", " graduated enter discover",
	" facts inventions life ", "accomplishments childhood timeline",
	" acquire befriend encounter", " achieve reache describe ",
	" invent innovate improve ", " impress outstanding award",
	" curous sceptical pessimistic", " spend enroll assume point",
	" explain discuss dispute", " learn teach study investigate",
	" propose suggest indicate", " pioneer explorer discoverer ",
	" advance promote lead", " direct control simulate ",
	" guide lead assist ", " inspire first initial",
	" vision predict foresee", " prediction inspiration achievement",
	" approve agree confirm", " deny argue disagree",
	" emotional loud imagination", " release announce celebrate discover",
	"introduce enjoy follow", " open present show",
	"meet enjoy follow create", "discover continue produce"

	};

	public String[] obtainAdditionalKeywordsForAnEntity(String entity){
	List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
	entity, "", "en", 30);
	Collection<String> keywordsToRemove = TextProcessor.fastTokenize(entity.toLowerCase(), false);
	List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(matchList);
	String[] res = new String[resList.size()];
	int i=0;
	for(List<String> phrase: resList){
	phrase.removeAll(keywordsToRemove);
	String keywords = phrase.toString().replace('[', ' ').replace(']', ' ').replace(',',' ');
	res[i] = keywords;
	i++;
	}
	return res;
	}

	public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
	String domain, String lang, int numbOfHits) {
	List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
	try {
	List<HitBase> resultList = yrunner.runSearch(query, numbOfHits);

	for (int i = 0; i < resultList.size(); i++) {
	{
	for (int j = i + 1; j < resultList.size(); j++) {
	HitBase h1 = resultList.get(i);
	HitBase h2 = resultList.get(j);
	String snapshot1 = StringCleaner.processSnapshotForMatching(h1
	.getTitle() + " . " + h1.getAbstractText());
	String snapshot2 = StringCleaner.processSnapshotForMatching(h2
	.getTitle() + " . " + h2.getAbstractText());
	SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,
	snapshot2);
	List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
	genResult.addAll(matchResult);
	}
	}
	}

	} catch (Exception e) {
	System.err.print("Problem extracting taxonomy node");
	}

	return genResult;
	}
	private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
	List<List<ParseTreeChunk>> matchList) {
	List<List<String>> res = new ArrayList<List<String>>();
	for (List<ParseTreeChunk> chunks : matchList) {
	List<String> wordRes = new ArrayList<String>();
	for (ParseTreeChunk ch : chunks) {
	List<String> lemmas = ch.getLemmas();
	for (int w = 0; w < lemmas.size(); w++)
	if ((!lemmas.get(w).equals("*"))
	&& ((ch.getPOSs().get(w).startsWith("NN") \|\| ch.getPOSs().get(w)
	.startsWith("VB"))) && lemmas.get(w).length() > 2) {
	String formedWord = lemmas.get(w);
	String stemmedFormedWord = ps.stem(formedWord);
	if (!stemmedFormedWord.startsWith("invalid"))
	wordRes.add(formedWord);
	}
	}
	wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
	if (wordRes.size() > 0) {
	res.add(wordRes);
	}
	}
	res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
	return res;
	}
	public static void main(String[] args){
	String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein");
	System.out.println(Arrays.asList(res));
	}
	}