/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Logger;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
import org.apache.commons.lang.StringUtils;
/*
 * This class performs content generation: it uses web mining and syntactic generalization to obtain sentences from the web,
 * then converts and combines them into a form readable by humans.
 *
 * Examples of generated articles, given the article title:
 * http://www.allvoices.com/contributed-news/9423860/content/81937916-ichie-sings-jazz-blues-contemporary-tunes
 * http://www.allvoices.com/contributed-news/9415063-britney-spears-femme-fatale-in-north-sf-bay-area
 *
 */
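/*
 * Example usage (a minimal sketch, mirroring main() below; assumes the search engine behind
 * BingWebQueryRunner is reachable and configured):
 *
 *   RelatedSentenceFinder finder = new RelatedSentenceFinder();
 *   List<HitBase> hits = finder.generateContentAbout("Albert Einstein");
 *   System.out.println(HitBase.toResultantString(hits));
 */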
public class RelatedSentenceFinder
{
private static final Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
PageFetcher pFetcher = new PageFetcher();
private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();
// mental verbs indicate that a sentence expresses an opinion, making it more appropriate for opinion mining
static List<String> MENTAL_VERBS = new ArrayList<String>(Arrays.asList(new String[] { "want", "know", "believe",
"appeal", "ask", "accept", "agree", "allow", "assume", "check", "confirm",
"convince", "deny", "disagree", "explain", "ignore", "inform", "remind", "request", "suggest", "suppose",
"think", "threaten", "try", "understand" }));
private static final int MAX_FRAGMENT_SENTS = 10;
public RelatedSentenceFinder()
{
}
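// "fast and dummy" variant: runs a plain web search on the word and returns the raw hits
// without any relevance verification (the sents argument is unused)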
public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word, List<String> sents) throws Exception
{
BingWebQueryRunner yrunner = new BingWebQueryRunner();
List<HitBase> searchResult = yrunner.runSearch(word);
return searchResult;
}
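// forms noun-phrase queries from the sentence, runs a web search for each,
// and mines relevant opinion sentences from the non-PDF results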
public List<HitBase> findRelatedOpinionsForSentence(String sentence, List<String> sents) throws Exception
{
List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
System.out.println(" \n\n=== Sentence = " + sentence);
List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);
BingWebQueryRunner yrunner = new BingWebQueryRunner();
for (String query : nounPhraseQueries)
{
System.out.println("\nquery = " + query);
// query += " "+join(MENTAL_VERBS, " OR ") ;
List<HitBase> searchResult = yrunner.runSearch(query);
if (searchResult != null)
{
for (HitBase item : searchResult)
{ // got some text from .html
if (item.getAbstractText() != null && !(item.getUrl().indexOf(".pdf") > 0))
{ // exclude
// pdf
opinionSentencesToAdd.add(augmentWithMinedSentencesAndVerifyRelevance(item, sentence, sents));
}
}
}
}
opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
return opinionSentencesToAdd;
}
/**
 * Main content generation function which takes a seed such as a person, rock group, or other entity name and produces a list of text fragments by web mining.
 *
 * @param sentence the entity name to write about
 * @return list of text fragment structures which contain approved (in terms of relevance) mined sentences, as well as original search result data
 * such as doc titles, abstracts, and urls
 */
public List<HitBase> generateContentAbout(String sentence) throws Exception
{
List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
System.out.println(" \n=== Entity to write about = " + sentence);
List<String> nounPhraseQueries = new ArrayList<String>();
//nounPhraseQueries.add(sentence + frequentPerformingVerbs);
BingWebQueryRunner yrunner = new BingWebQueryRunner();
for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs)
{
List<HitBase> searchResult = yrunner.runSearch(sentence + " " + verbAddition);
if (searchResult != null)
{
for (HitBase item : searchResult)
{ // got some text from .html
if (item.getAbstractText() != null && !(item.getUrl().indexOf(".pdf") > 0))
{ // exclude pdf
opinionSentencesToAdd.add(augmentWithMinedSentencesAndVerifyRelevance(item, sentence, null));
}
}
}
}
opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
return opinionSentencesToAdd;
}
/**
 * Takes a sentence and extracts noun phrases and entity names to form search queries for finding relevant sentences on the web, which are
 * then subject to relevance assessment by the similarity component. Search queries should be neither too general (irrelevant search results) nor too specific (too few
 * search results).
 * @param sentence the input sentence from which queries are formed
 * @return list of search expressions
 */
public static List<String> buildSearchEngineQueryFromSentence(String sentence)
{
ParseTreeChunk matcher = new ParseTreeChunk();
ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance();
List<ParseTreeChunk> nPhrases = pos.formGroupedPhrasesFromChunksForSentence(sentence).get(0);
List<String> queryArrayStr = new ArrayList<String>();
for (ParseTreeChunk ch : nPhrases)
{
String query = "";
int size = ch.getLemmas().size();
for (int i = 0; i < size; i++)
{
if (ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i).startsWith("J"))
{
query += ch.getLemmas().get(i) + " ";
}
}
query = query.trim();
int len = query.split(" ").length;
if (len < 2 || len > 5)
continue;
if (len < 4)
{ // every word should start with capital
String[] qs = query.split(" ");
boolean bAccept = true;
for (String w : qs)
{
if (w.toLowerCase().equals(w)) // if only two or three words, the phrase
// has to be a person name,
// title, or geo location
bAccept = false;
}
if (!bAccept)
continue;
}
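// prefix each keyword with '+' so the search engine treats it as a required term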
query = query.trim().replace(" ", " +");
query = " +" + query;
queryArrayStr.add(query);
}
if (queryArrayStr.size() < 1)
{ // relax constraints: accept noun phrases down to 2
// keywords
for (ParseTreeChunk ch : nPhrases)
{
String query = "";
int size = ch.getLemmas().size();
for (int i = 0; i < size; i++)
{
if (ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i).startsWith("J"))
{
query += ch.getLemmas().get(i) + " ";
}
}
query = query.trim();
int len = query.split(" ").length;
if (len < 2)
continue;
query = query.trim().replace(" ", " +");
query = " +" + query;
queryArrayStr.add(query);
}
}
queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
queryArrayStr.add(sentence);
return queryArrayStr;
}
/** Removes near-duplicate queries to avoid repetitive cleaning and repetitive searches
 * afterwards.
 *
 * @param hits list of sentences (search queries, search result abstracts, or titles)
 * @return list of sentences with duplicates removed
 */
public static List<String> removeDuplicatesFromQueries(List<String> hits)
{
StringDistanceMeasurer meas = new StringDistanceMeasurer();
double dupeThresh = 0.8; // if more similar than this, consider duplicates (was 0.7)
List<Integer> idsToRemove = new ArrayList<Integer>();
List<String> hitsDedup = new ArrayList<String>();
try
{
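// pairwise comparison: for each pair of near-identical strings, mark the later one for removal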
for (int i = 0; i < hits.size(); i++)
for (int j = i + 1; j < hits.size(); j++)
{
String title1 = hits.get(i);
String title2 = hits.get(j);
if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
continue;
if (meas.measureStringDistance(title1, title2) > dupeThresh)
{
idsToRemove.add(j); // dupes found, later list member to
// be deleted
}
}
for (int i = 0; i < hits.size(); i++)
if (!idsToRemove.contains(i))
hitsDedup.add(hits.get(i));
if (hitsDedup.size() < hits.size())
{
LOG.info("Removed duplicates from formed query, including " + hits.get(idsToRemove.get(0)));
}
}
catch (Exception e)
{
LOG.severe("Problem removing duplicates from query list");
}
return hitsDedup;
}
/** Removes near-duplicate fragments from search results.
 *
 * @param hits list of search result objects
 * @return list of search result objects with duplicate fragments removed
 */
public static List<HitBase> removeDuplicatesFromResultantHits(List<HitBase> hits)
{
StringDistanceMeasurer meas = new StringDistanceMeasurer();
double dupeThresh = 0.7; // if more similar than this, consider duplicates (was 0.8)
List<Integer> idsToRemove = new ArrayList<Integer>();
List<HitBase> hitsDedup = new ArrayList<HitBase>();
try
{
for (int i = 0; i < hits.size(); i++)
for (int j = i + 1; j < hits.size(); j++)
{
HitBase hit2 = hits.get(j);
List<Fragment> fragmList1 = hits.get(i).getFragments();
List<Fragment> fragmList2 = hits.get(j).getFragments();
List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
for(Fragment f1: fragmList1)
for(Fragment f2: fragmList2){
String sf1 = f1.getResultText();
String sf2 = f2.getResultText();
if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf2))
continue;
if (meas.measureStringDistance(sf1, sf2) > dupeThresh)
{
fragmList2Results.remove(f2);
LOG.info("Removed duplicates from formed fragments list: " + sf2);
}
}
hit2.setFragments(fragmList2Results);
hits.set(j, hit2 );
}
}
catch (Exception e)
{
LOG.severe("Problem removing duplicates from list of fragments");
}
return hits;
}
/**
 * Takes a single search result for an entity which is the subject of the essay to be written and forms essay sentences
 * from the title, abstract, and possibly the original page.
 * @param item search result
 * @param originalSentence seed for the essay to be written
 * @param sentsAll other sentences in the seed if it is multi-sentence
 * @return search result augmented with mined fragments
 */
public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item, String originalSentence,
List<String> sentsAll)
{
if (sentsAll==null)
sentsAll = new ArrayList<String>();
// put orig sentence in structure
List<String> origs = new ArrayList<String>();
origs.add(originalSentence);
item.setOriginalSentences(origs);
String title = item.getTitle().replace("<b>", " ").replace("</b>", " ").replace("  ", " ").replace("  ", " ");
// generation results for this sentence
List<Fragment> result = new ArrayList<Fragment>();
// form plain text from snippet
String snapshot = item.getAbstractText().replace("<b>", " ").replace("</b>", " ").replace("  ", " ")
.replace("  ", " ");
ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance();
// mark each '...' ellipsis in the snippet with a template token, which may later be
// substituted by the original sentence from the page, if relevant
String snapshotMarked = snapshot.replace("...", " _should_find_orig_ . _should_find_orig_");
String[] fragments = sm.splitSentences(snapshotMarked);
List<String> allFragms = new ArrayList<String>();
allFragms.addAll(Arrays.asList(fragments));
String[] sents = null; String downloadedPage;
try
{
if (snapshotMarked.length() != snapshot.length())
{
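// lengths differ iff the snippet contained '...': fetch the full page to recover the original sentences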
downloadedPage = pFetcher.fetchPage(item.getUrl());
if (downloadedPage != null && downloadedPage.length() > 100)
{
item.setPageContent(downloadedPage);
String pageContent = Utils.fullStripHTML(item.getPageContent());
pageContent = GeneratedSentenceProcessor.normalizeForSentenceSplitting(pageContent);
pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")//.replace("  ", ". ")
.replace("..", ".").replace(". . .", " ")
.trim(); // sometimes html breaks are converted into '  ' (two spaces), so we need to put '.'
sents = sm.splitSentences(pageContent);
sents = cleanListOfSents(sents);
}
}
}
catch (Exception e)
{
System.err.println("Problem downloading the page and splitting into sentences");
return item;
}
for (String fragment : allFragms)
{
String followSent = null;
if (fragment.length() < 50)
continue;
String pageSentence = "";
// try to find original sentence from webpage
if (fragment.indexOf("_should_find_orig_") > -1 && sents != null && sents.length > 0)
{
try
{
String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
fragment.replace("_should_find_orig_", ""), sents);
pageSentence = mainAndFollowSent[0];
followSent = mainAndFollowSent[1];
}
catch (Exception e)
{
e.printStackTrace();
}
}
else
{
// or get the original snippet as-is
pageSentence = fragment;
}
if (pageSentence != null)
pageSentence = pageSentence.replace("_should_find_orig_", "");
// resultant sentence should not be longer than four times the size of the
// snippet fragment
if (pageSentence != null && (float) pageSentence.length() / (float) fragment.length() < 4.0)
{ // was 2.0, but since snippet sentences are rather short now...
try
{ // get score from syntactic match between sentence in
// original text and mined sentence
double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
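// syntScore: syntactic generalization score; measScore: string distance to the seed sentence;
// mentalScore: bonus when the sentence contains a mental (opinion) verb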
SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence + " " + title, originalSentence);
List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
if (!matchRes.isVerbExists() || matchRes.isImperativeVerb())
{
System.out.println("Rejected Sentence : No verb OR Yes imperative verb :" + pageSentence);
continue;
}
syntScore = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
System.out.println(parseTreeChunk.listToString(match) + " " + syntScore
+ "\n pre-processed sent = '" + pageSentence + "'");
if (syntScore < 1.5)
{ // trying other sents
for (String currSent : sentsAll)
{
if (currSent.startsWith(originalSentence))
continue;
match = sm.assessRelevance(currSent, pageSentence).getMatchResult();
double syntScoreCurr = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
if (syntScoreCurr > syntScore)
{
syntScore = syntScoreCurr;
}
}
if (syntScore > 1.5)
{
System.out.println("Got match with other sent: " + parseTreeChunk.listToString(match) + " "
+ syntScore);
}
}
measScore = STRING_DISTANCE_MEASURER.measureStringDistance(originalSentence, pageSentence);
// now possibly increase score by finding mental verbs
// indicating opinions
for (String s : MENTAL_VERBS)
{
if (pageSentence.indexOf(s) > -1)
{
mentalScore += 0.3;
break;
}
}
if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5) && measScore < 0.8
&& pageSentence.length() > 40) // >70
{
String pageSentenceProc = GeneratedSentenceProcessor.acceptableMinedSentence(pageSentence);
if (pageSentenceProc != null)
{
pageSentenceProc = GeneratedSentenceProcessor.processSentence(pageSentenceProc);
if (followSent != null)
{
pageSentenceProc += " " + GeneratedSentenceProcessor.processSentence(followSent);
}
pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
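// fragment score combines syntactic match, string distance, mental-verb bonus,
// and a small bonus proportional to sentence length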
Fragment f = new Fragment(pageSentenceProc, syntScore + measScore + mentalScore
+ (double) pageSentenceProc.length() / (double) 50);
f.setSourceURL(item.getUrl());
f.fragment = fragment;
result.add(f);
System.out.println("Accepted sentence: " + pageSentenceProc + "| with title= " + title);
System.out.println("For fragment = " + fragment);
}
else
System.out.println("Rejected sentence due to wrong area at webpage: " + pageSentence);
}
else
System.out.println("Rejected sentence due to low score: " + pageSentence);
}
catch (Throwable t)
{
t.printStackTrace();
}
}
}
item.setFragments(result);
return item;
}
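// drops null, too-short, and mostly-whitespace sentences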
public static String[] cleanListOfSents(String[] sents)
{
List<String> sentsClean = new ArrayList<String>();
for (String s : sents)
{
if (s == null || s.trim().length() < 30)
continue;
sentsClean.add(s);
}
return sentsClean.toArray(new String[0]);
}
// given a fragment from snippet, finds an original sentence at a webpage by optimizing alignment score
public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(String fragment, String[] sents)
{
if (fragment.trim().length() < 15)
return null;
StringDistanceMeasurer meas = new StringDistanceMeasurer();
Double dist = 0.0;
String result = null, followSent = null;
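// scan page sentences, keeping the one most similar to the fragment (similarity must exceed 0.4),
// together with the following sentence when it is long enough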
for (int i = 0; i < sents.length; i++)
{
String s = sents[i];
if (s == null || s.length() < 30)
continue;
Double distCurr = meas.measureStringDistance(s, fragment);
if (distCurr > dist && distCurr > 0.4)
{
result = s;
dist = distCurr;
if (i < sents.length - 1 && sents[i + 1].length() > 60)
{
followSent = sents[i + 1];
}
}
}
return new String[] { result, followSent };
}
// given a fragment from snippet, finds an original sentence at a webpage by optimizing alignment score
public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(String fragment, String[] sents)
{
if (fragment.trim().length() < 15)
return null;
int bestSentIndex = -1;
StringDistanceMeasurer meas = new StringDistanceMeasurer();
Double distBest = 0.0; // was 10.0, which no similarity score could exceed, so no best sentence was ever found
String result = null, followSent = null;
for (int i = 0; i < sents.length; i++)
{
String s = sents[i];
if (s == null || s.length() < 30)
continue;
Double distCurr = meas.measureStringDistance(s, fragment);
if (distCurr>distBest){
distBest = distCurr;
bestSentIndex = i;
}
}
if (distBest > 0.4)
{
result = sents[bestSentIndex];
if (bestSentIndex < sents.length - 1 && sents[bestSentIndex + 1].length() > 60)
{
followSent = sents[bestSentIndex + 1];
}
}
return new String[] { result, followSent };
}
public static void main(String[] args)
{
RelatedSentenceFinder f = new RelatedSentenceFinder();
List<HitBase> hits = null;
try
{
// uncomment the sentence you would like to serve as a seed sentence for content generation for an event description
hits = f.generateContentAbout(
"Albert Einstein"
//"Britney Spears - The Femme Fatale Tour"
// "Rush Time Machine",
// "Blue Man Group" ,
// "Belly Dance With Zaharah",
// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
);
System.out.println(HitBase.toString(hits));
System.out.println(HitBase.toResultantString(hits));
//WordFileGenerator.createWordDoc("Essay about Albert Einstein", hits.get(0).getTitle(), hits);
}
catch (Exception e)
{
e.printStackTrace();
}
}
}