/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.similarity.apps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;

import opennlp.tools.parse_thicket.Triple;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.TextProcessor;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

import org.apache.commons.lang.StringUtils;

/*
 * This class performs content generation by using web mining and syntactic
 * generalization: it gets sentences from the web, then converts and combines
 * them into a form expected to be readable by humans.
 *
 * These are examples of generated articles, given the article title:
 * http://www.allvoices.com/contributed-news/9423860/content/81937916-ichie-sings-jazz-blues-contemporary-tunes
 * http://www.allvoices.com/contributed-news/9415063-britney-spears-femme-fatale-in-north-sf-bay-area
 */
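
/*
 * Minimal usage sketch (the key value is hypothetical; a valid Bing search API
 * key and the OpenNLP parser models must be available):
 *
 *   RelatedSentenceFinder finder =
 *       new RelatedSentenceFinder(1, 1, 1.1f, "YOUR_BING_API_KEY");
 *   List<HitBase> hits = finder.generateContentAbout("Albert Einstein");
 *   System.out.println(HitBase.toResultantString(hits));
 *
 * See main() at the bottom of this class for a runnable example.
 */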

public class RelatedSentenceFinder {
  private static final Logger LOG = Logger
      .getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
  PageFetcher pFetcher = new PageFetcher();
  ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
      .getInstance();
  protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
  protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
  protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();
  protected BingQueryRunner yrunner = new BingQueryRunner();
  // how many extra-keyword iterations to run, and how many search results to
  // mine per query
  protected int MAX_STEPS = 1;
  protected int MAX_SEARCH_RESULTS = 1;
  // minimum syntactic generalization score for a mined sentence to be accepted
  protected float RELEVANCE_THRESHOLD = 1.1f;
  protected Set<String> visitedURLs = new HashSet<String>();

  // used to indicate that a sentence is an opinion and hence more appropriate
  // for content generation
  static List<String> MENTAL_VERBS = new ArrayList<String>(
      Arrays.asList("want", "know", "believe", "appeal", "ask",
          "accept", "agree", "allow", "assume",
          "check", "confirm", "convince", "deny", "disagree", "explain",
          "ignore", "inform", "remind", "request", "suggest", "suppose",
          "think", "threaten", "try", "understand"));

  private static final int MAX_FRAGMENT_SENTS = 10;

  public RelatedSentenceFinder(int ms, int msr, float thresh, String key) {
    this.MAX_STEPS = ms;
    this.MAX_SEARCH_RESULTS = msr;
    this.RELEVANCE_THRESHOLD = thresh;
    yrunner.setKey(key);
  }

  public RelatedSentenceFinder() {
    // use the default number of steps, search results and relevance threshold
  }

  public void setLang(String lang) {
    yrunner.setLang(lang);
  }

  public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
      List<String> sents) throws Exception {
    return yrunner.runSearch(word, 100);
  }

  public List<HitBase> findRelatedOpinionsForSentence(String sentence,
      List<String> sents) throws Exception {
    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
    System.out.println(" \n\n=== Sentence = " + sentence);
    List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);

    for (String query : nounPhraseQueries) {
      System.out.println("\nquery = " + query);
      // query += " " + join(MENTAL_VERBS, " OR ");
      List<HitBase> searchResult = yrunner.runSearch(query, 100);
      if (searchResult != null) {
        for (HitBase item : searchResult) { // got some text from .html
          if (item.getAbstractText() != null
              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
            opinionSentencesToAdd
                .add(augmentWithMinedSentencesAndVerifyRelevance(item,
                    sentence, sents));
          }
        }
      }
    }

    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
    return opinionSentencesToAdd;
  }

  /**
   * Main content generation function which takes a seed as an entity name (a
   * person, rock group, or other entity) and produces a list of text fragments
   * by web mining.
   *
   * @param sentence
   *          the entity name to write about
   * @return List<HitBase> of text fragment structures which contain approved
   *         (in terms of relevance) mined sentences, as well as original
   *         search result data such as doc titles, abstracts, and urls.
   */
  public List<HitBase> generateContentAbout(String sentence) throws Exception {
    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
    System.out.println(" \n=== Entity to write about = " + sentence);

    String[] extraKeywords = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity(sentence);
    System.out.println("Found extraKeywords " + Arrays.asList(extraKeywords));
    if (extraKeywords == null || extraKeywords.length < 1)
      extraKeywords = StoryDiscourseNavigator.frequentPerformingVerbs;

    int stepCount = 0;
    for (String verbAddition : extraKeywords) {
      List<HitBase> searchResult = yrunner.runSearch(sentence + " "
          + verbAddition, MAX_SEARCH_RESULTS); // was 100; truncated for shorter runs
      if (searchResult != null) {
        if (MAX_SEARCH_RESULTS < searchResult.size())
          searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
        for (HitBase item : searchResult) { // got some text from .html
          if (item.getAbstractText() != null
              && !(item.getUrl().indexOf(".pdf") > 0) && !visitedURLs.contains(item.getUrl())) { // exclude pdf
            // alternative: augmentWithMinedSentencesAndVerifyRelevance(item, sentence, null)
            opinionSentencesToAdd
                .add(buildParagraphOfGeneratedText(item, sentence, null));
            visitedURLs.add(item.getUrl());
          }
        }
      }
      stepCount++;
      if (stepCount > MAX_STEPS)
        break;
    }

    opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
    return opinionSentencesToAdd;
  }

  /**
   * Takes a sentence and extracts noun phrases and entity names to form search
   * queries for finding relevant sentences on the web, which are then subject
   * to relevance assessment by Similarity. Search queries should not be too
   * general (producing irrelevant search results) or too specific (producing
   * too few search results).
   *
   * @param sentence
   *          input sentence to form queries from
   * @return List<String> of search expressions
   */
  public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
    ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
        .getInstance();

    List<ParseTreeChunk> nPhrases = pos
        .formGroupedPhrasesFromChunksForSentence(sentence).get(0);
    List<String> queryArrayStr = new ArrayList<String>();
    for (ParseTreeChunk ch : nPhrases) {
      String query = "";
      int size = ch.getLemmas().size();

      for (int i = 0; i < size; i++) {
        if (ch.getPOSs().get(i).startsWith("N")
            || ch.getPOSs().get(i).startsWith("J")) {
          query += ch.getLemmas().get(i) + " ";
        }
      }
      query = query.trim();
      int len = query.split(" ").length;
      if (len < 2 || len > 5)
        continue;
      if (len < 4) { // every word should start with a capital: if the phrase
        // is only two or three words, it has to be a person name, title or
        // geo location
        String[] qs = query.split(" ");
        boolean bAccept = true;
        for (String w : qs) {
          if (w.toLowerCase().equals(w))
            bAccept = false;
        }
        if (!bAccept)
          continue;
      }

      query = query.trim().replace(" ", " +");
      query = " +" + query;

      queryArrayStr.add(query);
    }
    if (queryArrayStr.size() < 1) { // release constraints on NPs, down to 2
      // keywords
      for (ParseTreeChunk ch : nPhrases) {
        String query = "";
        int size = ch.getLemmas().size();

        for (int i = 0; i < size; i++) {
          if (ch.getPOSs().get(i).startsWith("N")
              || ch.getPOSs().get(i).startsWith("J")) {
            query += ch.getLemmas().get(i) + " ";
          }
        }
        query = query.trim();
        int len = query.split(" ").length;
        if (len < 2)
          continue;

        query = query.trim().replace(" ", " +");
        query = " +" + query;

        queryArrayStr.add(query);
      }
    }

    queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
    queryArrayStr.add(sentence);

    return queryArrayStr;
  }
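
  /*
   * Illustration (the parser output here is hypothetical): for the seed
   * "Albert Einstein received the Nobel Prize", a noun phrase such as
   * "Nobel Prize" passes the capitalization test and yields the query
   * " +Nobel +Prize" (each keyword prefixed with '+' to force exact matching);
   * all-lowercase phrases are only admitted in the relaxed second pass, and
   * the original sentence itself is always appended as a fallback query.
   */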

  /**
   * Removes near-duplicate queries to avoid repetitive searches and to ease
   * de-duplication of the results afterwards.
   *
   * @param hits
   *          List<String> of sentences (search queries, or search result
   *          abstracts, or titles)
   * @return List<String> of sentences with duplicates removed
   */
  public static List<String> removeDuplicatesFromQueries(List<String> hits) {
    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    double dupeThresh = 0.8; // if more similar than this, consider duplicates (was 0.7)
    List<Integer> idsToRemove = new ArrayList<Integer>();
    List<String> hitsDedup = new ArrayList<String>();
    try {
      for (int i = 0; i < hits.size(); i++)
        for (int j = i + 1; j < hits.size(); j++) {
          String title1 = hits.get(i);
          String title2 = hits.get(j);
          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
            continue;
          if (meas.measureStringDistance(title1, title2) > dupeThresh) {
            idsToRemove.add(j); // duplicates found; later list member to be deleted
          }
        }

      for (int i = 0; i < hits.size(); i++)
        if (!idsToRemove.contains(i))
          hitsDedup.add(hits.get(i));

      if (hitsDedup.size() < hits.size()) {
        LOG.info("Removed duplicates from formed query, including "
            + hits.get(idsToRemove.get(0)));
      }

    } catch (Exception e) {
      LOG.severe("Problem removing duplicates from query list");
    }

    return hitsDedup;
  }
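
  /*
   * For example, near-identical queries such as " +Albert +Einstein +biography"
   * and " +Albert +Einstein +biographies" would typically score above the 0.8
   * similarity threshold, so the later of the two is dropped.
   */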

  /**
   * Removes duplicate fragments from search results.
   *
   * @param hits
   *          List<HitBase> of search result objects
   * @return List<HitBase> of search result objects with duplicate fragments
   *         removed
   */
  public static List<HitBase> removeDuplicatesFromResultantHits(
      List<HitBase> hits) {
    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    double dupeThresh = 0.7; // if more similar than this, consider duplicates (was 0.8)
    try {
      for (int i = 0; i < hits.size(); i++)
        for (int j = i + 1; j < hits.size(); j++) {
          HitBase hit2 = hits.get(j);
          List<Fragment> fragmList1 = hits.get(i).getFragments();
          List<Fragment> fragmList2 = hits.get(j).getFragments();
          List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
          for (Fragment f1 : fragmList1)
            for (Fragment f2 : fragmList2) {
              String sf1 = f1.getResultText();
              String sf2 = f2.getResultText();
              if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf2))
                continue;
              if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
                fragmList2Results.remove(f2);
                LOG.info("Removed duplicates from formed fragments list: "
                    + sf2);
              }
            }

          hit2.setFragments(fragmList2Results);
          hits.set(j, hit2);
        }
    } catch (Exception e) {
      LOG.severe("Problem removing duplicates from list of fragments");
    }
    return hits;
  }

  /**
   * Takes a single search result for an entity which is the subject of the
   * essay to be written, and forms essay sentences from the title, abstract,
   * and possibly the original page.
   *
   * @param item
   *          search result
   * @param originalSentence
   *          seed for the essay to be written
   * @param sentsAll
   *          List<String> of other sentences in the seed, if it is
   *          multi-sentence
   * @return the search result, augmented with mined fragments
   */
  public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
      String originalSentence, List<String> sentsAll) {
    if (sentsAll == null)
      sentsAll = new ArrayList<String>();
    // put orig sentence in structure
    List<String> origs = new ArrayList<String>();
    origs.add(originalSentence);
    item.setOriginalSentences(origs);
    String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
        .replace("  ", " ").replace("  ", " ");
    // generation results for this sentence
    List<Fragment> result = new ArrayList<Fragment>();
    // form plain text from snippet
    String snapshot = item.getAbstractText().replace("<b>", " ")
        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");

    // mark the template expression which can be substituted by the original
    // page sentence if relevant
    String snapshotMarked = snapshot.replace("...",
        " _should_find_orig_ . _should_find_orig_");
    String[] fragments = sm.splitSentences(snapshotMarked);
    List<String> allFragms = new ArrayList<String>();
    allFragms.addAll(Arrays.asList(fragments));

    String[] sents = null;
    String downloadedPage = null;
    try {
      if (snapshotMarked.length() != snapshot.length()) {
        downloadedPage = pFetcher.fetchPage(item.getUrl());
        if (downloadedPage != null && downloadedPage.length() > 100) {
          item.setPageContent(downloadedPage);
          String pageContent = Utils.fullStripHTML(item.getPageContent());
          pageContent = GeneratedSentenceProcessor
              .normalizeForSentenceSplitting(pageContent);
          // sometimes html breaks are converted into '  ' (two spaces), so
          // they need to be turned into sentence boundaries
          pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
          sents = sm.splitSentences(pageContent);

          sents = ContentGeneratorSupport.cleanListOfSents(sents);
        }
      }
    } catch (Exception e) {
      System.err
          .println("Problem downloading the page and splitting into sentences");
      return item;
    }

    for (String fragment : allFragms) {
      String followSent = "";
      if (fragment.length() < 50)
        continue;
      String pageSentence = "";
      // try to find the original sentence from the webpage
      if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
          && sents.length > 0) {
        try {
          // first try the sentences from the page, sorted by length
          String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
          String[] mainAndFollowSent = null;

          try {
            mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
                fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
          } catch (Exception e) {
            e.printStackTrace();
          }
          // if the above gives null, then try to match against all sentences
          // from the snippet fragment
          if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
            mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
                fragment.replace("_should_find_orig_", ""), sents);
          }

          if (mainAndFollowSent != null && mainAndFollowSent[0] != null) {
            pageSentence = mainAndFollowSent[0];
            for (int i = 1; i < mainAndFollowSent.length; i++)
              if (mainAndFollowSent[i] != null)
                followSent += mainAndFollowSent[i];
          }

        } catch (Exception e) {
          e.printStackTrace();
        }
      } else
        // or get the original snippet
        pageSentence = fragment;
      if (pageSentence != null)
        pageSentence = pageSentence.replace("_should_find_orig_", "");

      // the resultant sentence should not be longer than four times the size
      // of the snippet fragment
      if (pageSentence != null && pageSentence.length() > 50)
      // && (float) pageSentence.length() / (float) fragment.length() < 4.0)
      { // was 2.0

        try { // get score from syntactic match between the sentence in the
          // original text and the mined sentence
          double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

          SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
              + " " + title, originalSentence);
          List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
          if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
            System.out
                .println("Rejected Sentence : No verb OR imperative verb: "
                    + pageSentence);
            continue;
          }

          syntScore = parseTreeChunkListScorer
              .getParseTreeChunkListScore(match);
          System.out.println(parseTreeChunk.listToString(match) + " "
              + syntScore + "\n pre-processed sent = '" + pageSentence + "'");

          if (syntScore < RELEVANCE_THRESHOLD) { // was 1.5; trying other sents
            for (String currSent : sentsAll) {
              if (currSent.startsWith(originalSentence))
                continue;
              match = sm.assessRelevance(currSent, pageSentence)
                  .getMatchResult();
              double syntScoreCurr = parseTreeChunkListScorer
                  .getParseTreeChunkListScore(match);
              if (syntScoreCurr > syntScore) {
                syntScore = syntScoreCurr;
              }
            }
            if (syntScore > RELEVANCE_THRESHOLD) {
              System.out.println("Got match with other sent: "
                  + parseTreeChunk.listToString(match) + " " + syntScore);
            }
          }

          measScore = stringDistanceMeasurer.measureStringDistance(
              originalSentence, pageSentence);

          if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
              && measScore < 0.8 && pageSentence.length() > 40) // was > 70
          {
            String pageSentenceProc = GeneratedSentenceProcessor
                .acceptableMinedSentence(pageSentence);
            if (pageSentenceProc != null) {
              pageSentenceProc = GeneratedSentenceProcessor
                  .processSentence(pageSentenceProc);
              followSent = GeneratedSentenceProcessor.processSentence(followSent);
              if (followSent != null) {
                pageSentenceProc += " " + followSent;
              }

              pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
              Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
                  + mentalScore + (double) pageSentenceProc.length()
                  / (double) 50);
              f.setSourceURL(item.getUrl());
              f.fragment = fragment;
              result.add(f);
              System.out.println("Accepted sentence: " + pageSentenceProc + " | " + followSent
                  + "| with title= " + title);
              System.out.println("For fragment = " + fragment);
            } else
              System.out
                  .println("Rejected sentence due to wrong area at webpage: "
                      + pageSentence);
          } else
            System.out.println("Rejected sentence due to low score: "
                + pageSentence);
        } catch (Throwable t) {
          t.printStackTrace();
        }
      }
    }
    item.setFragments(result);
    return item;
  }
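
  /*
   * Illustration of the acceptance test above (the scores are hypothetical):
   * with the default RELEVANCE_THRESHOLD = 1.1f, a mined sentence with
   * syntScore = 1.4 (syntactic generalization score against the seed) and
   * measScore = 0.6 (string distance to the seed) passes, since
   * syntScore > 1.1 and 0.5 < measScore < 0.8; its ranking score becomes
   * syntScore + measScore + mentalScore + length/50. A sentence with
   * measScore = 0.9 is rejected as a near-copy of the seed, however high its
   * syntScore.
   */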

  // given a fragment from a snippet, finds the original sentence at the
  // webpage by optimizing the alignment score
  public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
      String fragment, String[] sents) {
    if (fragment.trim().length() < 15)
      return null;

    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    Double dist = 0.0;
    String result = null, followSent = "";
    for (int i = 0; i < sents.length; i++) {
      String s = sents[i];
      if (s == null || s.length() < 30)
        continue;
      Double distCurr = meas.measureStringDistance(s, fragment);
      if (distCurr > dist && distCurr > 0.4) {
        result = s;
        dist = distCurr;
        try {
          // also pick up to two acceptable follow-up sentences for context
          if (i < sents.length - 1 && sents[i + 1].length() > 60) {
            String f1 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i + 1]);
            if (f1 != null) {
              followSent = f1;
            }
          }

          if (i < sents.length - 2 && sents[i + 2].length() > 60) {
            String f2 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i + 2]);
            if (f2 != null) {
              followSent += " " + f2;
            }
          }
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
    return new String[] { result, followSent };
  }
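
  /*
   * Example (the sentences and distances are hypothetical): for the snippet
   * fragment "Einstein was awarded the Nobel Prize in Physics" and the page
   * sentences
   *   "Albert Einstein was awarded the 1921 Nobel Prize in Physics." (~0.8)
   *   "He is best known for the theory of relativity."               (~0.1)
   * the first sentence wins, because its string distance to the fragment is
   * maximal and above 0.4; its follow-up sentences are then attached for
   * context.
   */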

  // given a fragment from a snippet, finds the best original sentence at a
  // webpage by optimizing the alignment score
  public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
      String fragment, String[] sents) {
    if (fragment.trim().length() < 15)
      return null;
    int bestSentIndex = -1;
    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    Double distBest = 0.0;
    String result = null, followSent = null;
    for (int i = 0; i < sents.length; i++) {
      String s = sents[i];
      if (s == null || s.length() < 30)
        continue;
      Double distCurr = meas.measureStringDistance(s, fragment);
      if (distCurr > distBest) {
        distBest = distCurr;
        bestSentIndex = i;
      }
    }
    if (bestSentIndex > -1 && distBest > 0.4) {
      result = sents[bestSentIndex];

      if (bestSentIndex < sents.length - 1
          && sents[bestSentIndex + 1].length() > 60) {
        followSent = sents[bestSentIndex + 1];
      }
    }

    return new String[] { result, followSent };
  }
  public String[] extractSentencesFromPage(String downloadedPage) {

    int maxSentsFromPage = 100;

    // String pageOrigHTML = pFetcher.fetchOrigHTML(url);

    // turn runs of whitespace (collapsed html breaks) into sentence delimiters
    downloadedPage = downloadedPage.replace("     ", "&");
    downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
    String[] sents = downloadedPage.split("#");
    List<TextChunk> sentsList = new ArrayList<TextChunk>();
    for (String s : sents) {
      s = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s);
      sentsList.add(new TextChunk(s, s.length()));
    }

    // keep the maxSentsFromPage longest chunks, which are most likely to be
    // content sentences rather than menu or boilerplate items
    Collections.sort(sentsList, new TextChunkComparable());
    String[] longestSents = new String[maxSentsFromPage];
    int j = 0;
    int initIndex = sentsList.size() - 1 - maxSentsFromPage;
    if (initIndex < 0)
      initIndex = 0;
    for (int i = initIndex; i < sentsList.size() && j < maxSentsFromPage; i++) {
      longestSents[j] = sentsList.get(i).text;
      j++;
    }

    sents = cleanSplitListOfSents(longestSents);

    // sents = removeDuplicates(sents);
    // sents = verifyEnforceStartsUpperCase(sents);

    return sents;
  }

  public class TextChunk {
    public TextChunk(String s, int length) {
      this.text = s;
      this.len = length;
    }

    public String text;
    public int len;
  }

  public class TextChunkComparable implements Comparator<TextChunk> {
    public int compare(TextChunk ch1, TextChunk ch2) {
      if (ch1.len > ch2.len)
        return 1;
      else if (ch1.len < ch2.len)
        return -1;
      else
        return 0;
    }
  }

  protected String[] cleanSplitListOfSents(String[] longestSents) {
    float minFragmentLength = 40, minFragmentLengthSpace = 4;

    List<String> sentsClean = new ArrayList<String>();
    for (String sentenceOrMultSent : longestSents) {
      if (sentenceOrMultSent == null || sentenceOrMultSent.length() < 20)
        continue;
      if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent) == null) {
        // System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = " + sentenceOrMultSent);
        continue;
      }
      // reject chunks whose average sentence length is too short, like
      // "aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n."
      int numOfDots = sentenceOrMultSent.replace('.', '&').split("&").length;
      float avgSentenceLengthInTextPortion = (float) sentenceOrMultSent.length() / (float) numOfDots;
      if (avgSentenceLengthInTextPortion < minFragmentLength)
        continue;
      // reject chunks whose average token length is too short, like
      // "o oo o ooo o o o ooo oo ooo o o oo"
      numOfDots = sentenceOrMultSent.replace(' ', '&').split("&").length;
      avgSentenceLengthInTextPortion = (float) sentenceOrMultSent.length() / (float) numOfDots;
      if (avgSentenceLengthInTextPortion < minFragmentLengthSpace)
        continue;

      List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);

      // forced split by ',' somewhere in the middle of a sentence
      // disused - Feb 26 13
      // furtherSplit = furtherMakeSentencesShorter(furtherSplit);
      furtherSplit.remove(furtherSplit.size() - 1); // drop the last, possibly incomplete, sentence
      for (String s : furtherSplit) {
        if (s.indexOf('|') > -1)
          continue;
        s = s.replace("<em>", " ").replace("</em>", " ");
        s = Utils.convertToASCII(s);
        sentsClean.add(s);
      }
    }
    return (String[]) sentsClean.toArray(new String[0]);
  }
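
  /*
   * Worked example for the two heuristics above (the input is hypothetical):
   * a 200-character chunk containing 10 dots has an average sentence length of
   * 200 / 10 = 20 < minFragmentLength (40), so it is rejected as a list of
   * short menu-like items; a 35-character chunk with 20 spaces averages well
   * under minFragmentLengthSpace (4) characters per token, so it is rejected
   * as non-text.
   */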

  public Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll) {
    if (sentsAll == null)
      sentsAll = new ArrayList<String>();
    // put orig sentence in structure
    List<String> origs = new ArrayList<String>();
    origs.add(originalSentence);
    item.setOriginalSentences(origs);
    // form plain text from snippet
    String snapshot = item.getAbstractText().replace("<b>", " ")
        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");

    // mark the template expression which can be substituted by the original
    // page sentence if relevant
    String snapshotMarked = snapshot.replace("...",
        " _should_find_orig_ . _should_find_orig_");
    String[] fragments = sm.splitSentences(snapshotMarked);
    List<String> allFragms = new ArrayList<String>();
    allFragms.addAll(Arrays.asList(fragments));

    String[] sents = null;
    String downloadedPage = null;
    try {
      if (snapshotMarked.length() != snapshot.length()) {
        downloadedPage = pFetcher.fetchPage(item.getUrl());
        if (downloadedPage != null && downloadedPage.length() > 100) {
          item.setPageContent(downloadedPage);
          String pageContent = Utils.fullStripHTML(item.getPageContent());
          pageContent = GeneratedSentenceProcessor
              .normalizeForSentenceSplitting(pageContent);
          // sometimes html breaks are converted into '  ' (two spaces), so
          // they need to be turned into sentence boundaries
          pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
          sents = sm.splitSentences(pageContent);

          sents = ContentGeneratorSupport.cleanListOfSents(sents);
        }
      }
    } catch (Exception e) {
      System.err
          .println("Problem downloading the page and splitting into sentences");
      return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
    }
    return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
  }

  String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults) {
    String[] mainAndFollowSent = null;

    String downloadedPage = (String) fragmentExtractionResults.getSecond();
    String[] sents = (String[]) fragmentExtractionResults.getThird();

    if (fragment.length() < 50)
      return null;
    // try to find the original sentence from the webpage
    if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
        && sents.length > 0) {
      try {
        // first try the sentences from the page, sorted by length
        String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);

        try {
          mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
              fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
        } catch (Exception e) {
          e.printStackTrace();
        }
        // if the above gives null, then try to match against all sentences
        // from the snippet fragment
        if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
          mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
              fragment.replace("_should_find_orig_", ""), sents);
        }

      } catch (Exception e) {
        e.printStackTrace();
      }
    } else {
      // or use the original snippet fragment itself as the candidate
      mainAndFollowSent = new String[] { fragment.replace("_should_find_orig_", "") };
    }

    return mainAndFollowSent;
  }

  private Fragment verifyCandidateSentencesAndFormParagraph(
      String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {
    Fragment result = null;

    String pageSentence = candidateSentences[0];
    String followSent = "";
    for (int i = 1; i < candidateSentences.length; i++)
      if (candidateSentences[i] != null)
        followSent += candidateSentences[i];
    String title = item.getTitle();

    // the resultant sentence should not be longer than four times the size
    // of the snippet fragment
    if (!(pageSentence != null && pageSentence.length() > 50)) {
      System.out.println("Cannot accept the sentence = " + pageSentence
          + ": it is null or too short");

      return null;
    }

    try { // get score from syntactic match between the sentence in the
      // original text and the mined sentence
      double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

      SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
          + " " + title, originalSentence);
      List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
      if (match == null || match.size() < 1) {
        System.out
            .println("Rejected Sentence : empty match " + pageSentence);
        return null;
      }

      if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
        System.out
            .println("Rejected Sentence : No verb OR imperative verb: "
                + pageSentence);
        return null;
      }

      syntScore = parseTreeChunkListScorer
          .getParseTreeChunkListScore(match);
      System.out.println(parseTreeChunk.listToString(match) + " "
          + syntScore + "\n pre-processed sent = '" + pageSentence + "'");

      try {
        if (sentsAll != null && syntScore < RELEVANCE_THRESHOLD) { // was 1.5; trying other sents
          for (String currSent : sentsAll) {
            if (currSent.startsWith(originalSentence))
              continue;
            match = sm.assessRelevance(currSent, pageSentence)
                .getMatchResult();
            double syntScoreCurr = parseTreeChunkListScorer
                .getParseTreeChunkListScore(match);
            if (syntScoreCurr > syntScore) {
              syntScore = syntScoreCurr;
            }
          }
          if (syntScore > RELEVANCE_THRESHOLD) {
            System.out.println("Got match with other sent: "
                + parseTreeChunk.listToString(match) + " " + syntScore);
          }
        }
      } catch (Exception e) {
        e.printStackTrace();
      }

      measScore = stringDistanceMeasurer.measureStringDistance(
          originalSentence, pageSentence);

      if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
          && measScore < 0.8 && pageSentence.length() > 40) // was > 70
      {
        String pageSentenceProc = GeneratedSentenceProcessor
            .acceptableMinedSentence(pageSentence);
        if (pageSentenceProc != null) {
          pageSentenceProc = GeneratedSentenceProcessor
              .processSentence(pageSentenceProc);
          followSent = GeneratedSentenceProcessor.processSentence(followSent);
          if (followSent != null) {
            pageSentenceProc += " " + followSent;
          }

          pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
          result = new Fragment(pageSentenceProc, syntScore + measScore
              + mentalScore + (double) pageSentenceProc.length()
              / (double) 50);
          result.setSourceURL(item.getUrl());
          result.fragment = fragment;

          System.out.println("Accepted sentence: " + pageSentenceProc
              + "| with title= " + title);
          System.out.println("For fragment = " + fragment);
        } else
          System.out
              .println("Rejected sentence due to wrong area at webpage: "
                  + pageSentence);
      } else
        System.out.println("Rejected sentence due to low score: "
            + pageSentence);
    } catch (Throwable t) {
      t.printStackTrace();
    }

    return result;
  }

  public HitBase buildParagraphOfGeneratedText(HitBase item,
      String originalSentence, List<String> sentsAll) {
    List<Fragment> results = new ArrayList<Fragment>();

    Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll);

    List<String> allFragms = (List<String>) fragmentExtractionResults.getFirst();

    for (String fragment : allFragms) {
      String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults);
      if (candidateSentences == null)
        continue;
      Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll);
      if (res != null)
        results.add(res);
    }

    item.setFragments(results);
    return item;
  }
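
  /*
   * The three methods above form the mining pipeline used by
   * generateContentAbout(): formCandidateFragmentsForPage() turns a search
   * result into snippet fragments plus the sentences of the downloaded page;
   * formCandidateSentences() aligns each fragment with the original page
   * sentence and its follow-up sentences; and
   * verifyCandidateSentencesAndFormParagraph() keeps only the candidates that
   * pass the syntactic and string-distance relevance tests.
   */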

  public static void main(String[] args) {
    RelatedSentenceFinder f = new RelatedSentenceFinder();

    List<HitBase> hits = null;
    try {
      // uncomment the sentence you would like to serve as a seed sentence for
      // content generation for an event description
      hits = f.generateContentAbout("Albert Einstein"
      // "Britney Spears - The Femme Fatale Tour"
      // "Rush Time Machine",
      // "Blue Man Group" ,
      // "Belly Dance With Zaharah",
      // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
      // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
          );
      System.out.println(HitBase.toString(hits));
      System.out.println(HitBase.toResultantString(hits));
      // WordFileGenerator.createWordDoc("Essay about Albert Einstein",
      // hits.get(0).getTitle(), hits);

    } catch (Exception e) {
      e.printStackTrace();
    }

  }

}