/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.similarity.apps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Logger;

import opennlp.tools.parse_thicket.Triple;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
/*
 * Generates content by web mining and syntactic generalization: sentences on a
 * given topic are mined from the web, then converted and combined into a form
 * that is readable by humans and not distinguishable from genuine content by
 * search engines.
 */
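/*
 * Pipeline overview (method names as defined below): generateContentAbout()
 * runs web searches for the seed expanded with frequent performing verbs; for
 * each hit, buildParagraphOfGeneratedText() extracts candidate fragments from
 * the snippet and, when an ellipsis points to elided text, from the downloaded
 * page (formCandidateFragmentsForPage / formCandidateSentences), and then
 * scores and filters them in verifyCandidateSentencesAndFormParagraph().
 */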
public class ContentGenerator /*extends RelatedSentenceFinder*/ {
  private static final Logger LOG = Logger
      .getLogger("opennlp.tools.similarity.apps.ContentGenerator");
  PageFetcher pFetcher = new PageFetcher();
  ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
      .getInstance();
  protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
  protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
  protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();
  protected BingQueryRunner yrunner = new BingQueryRunner();
  protected ContentGeneratorSupport support = new ContentGeneratorSupport();
  protected int MAX_STEPS = 1; // how many frequent-verb query expansions to try
  protected int MAX_SEARCH_RESULTS = 1; // how many search hits to process per query
  protected float RELEVANCE_THRESHOLD = 1.1f; // min syntactic match score for acceptance
  //private static final int MAX_FRAGMENT_SENTS = 10;
  public ContentGenerator(int ms, int msr, float thresh, String key) {
    this.MAX_STEPS = ms;
    this.MAX_SEARCH_RESULTS = msr;
    this.RELEVANCE_THRESHOLD = thresh;
    yrunner.setKey(key);
  }
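  /*
   * Usage sketch (the API key value is a placeholder): a generator that tries
   * two verb expansions, keeps up to five hits per query, and requires a
   * syntactic match score above 1.5 would be constructed as
   *   new ContentGenerator(2, 5, 1.5f, "<your-search-api-key>");
   */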
  public ContentGenerator() {
  }
  public void setLang(String lang) {
    yrunner.setLang(lang);
  }
  /**
   * Main content generation function: takes a seed such as a person, rock
   * group, or other entity name and produces a list of text fragments by web
   * mining.
   *
   * @param sentence
   *          the entity name to write about
   * @return List<HitBase> of text fragment structures which contain approved
   *         (in terms of relevance) mined sentences, as well as original
   *         search result data such as doc titles, abstracts, and urls.
   */
  public List<HitBase> generateContentAbout(String sentence) throws Exception {
    List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
    System.out.println(" \n=== Entity to write about = " + sentence);
    int stepCount = 0;
    for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
      List<HitBase> searchResult = yrunner.runSearch(sentence + " "
          + verbAddition, MAX_SEARCH_RESULTS);
      if (searchResult != null) { // check for null before truncating the list
        if (MAX_SEARCH_RESULTS < searchResult.size())
          searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
        for (HitBase item : searchResult) { // got some text from .html
          if (item.getAbstractText() != null
              && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
            opinionSentencesToAdd
                .add(buildParagraphOfGeneratedText(item, sentence, null));
          }
        }
      }
      stepCount++;
      if (stepCount > MAX_STEPS)
        break;
    }
    opinionSentencesToAdd = ContentGeneratorSupport
        .removeDuplicatesFromResultantHits(opinionSentencesToAdd);
    return opinionSentencesToAdd;
  }
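  /*
   * Illustrative run of the loop above (the verb list itself lives in
   * StoryDiscourseNavigator.frequentPerformingVerbs): for the seed
   * "Albert Einstein" and a verb such as "wrote", the search query becomes
   * "Albert Einstein wrote". Since stepCount is incremented after each verb
   * and the loop exits once stepCount > MAX_STEPS, the default MAX_STEPS = 1
   * tries only the first two verbs.
   */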
  /**
   * Takes a sentence and extracts noun phrases and entity names to form search
   * queries for finding relevant sentences on the web, which are then subject
   * to relevance assessment by Similarity. Search queries should not be too
   * general (irrelevant search results) or too specific (too few search
   * results).
   *
   * @param sentence
   *          input sentence to form queries from
   * @return List<String> of search expressions
   */
  public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
    ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
        .getInstance();
    List<ParseTreeChunk> nPhrases = pos
        .formGroupedPhrasesFromChunksForSentence(sentence).get(0);
    List<String> queryArrayStr = new ArrayList<String>();
    for (ParseTreeChunk ch : nPhrases) {
      String query = "";
      int size = ch.getLemmas().size();
      for (int i = 0; i < size; i++) {
        if (ch.getPOSs().get(i).startsWith("N")
            || ch.getPOSs().get(i).startsWith("J")) {
          query += ch.getLemmas().get(i) + " ";
        }
      }
      query = query.trim();
      int len = query.split(" ").length;
      if (len < 2 || len > 5)
        continue;
      if (len < 4) { // every word should start with a capital: if only two or
        // three words, the phrase has to be a person name, title or geo
        // location to be specific enough
        String[] qs = query.split(" ");
        boolean bAccept = true;
        for (String w : qs) {
          if (w.toLowerCase().equals(w))
            bAccept = false;
        }
        if (!bAccept)
          continue;
      }
      query = query.trim().replace(" ", " +");
      query = " +" + query;
      queryArrayStr.add(query);
    }
    if (queryArrayStr.size() < 1) { // relax the constraints on NPs down to 2
      // keywords
      for (ParseTreeChunk ch : nPhrases) {
        String query = "";
        int size = ch.getLemmas().size();
        for (int i = 0; i < size; i++) {
          if (ch.getPOSs().get(i).startsWith("N")
              || ch.getPOSs().get(i).startsWith("J")) {
            query += ch.getLemmas().get(i) + " ";
          }
        }
        query = query.trim();
        int len = query.split(" ").length;
        if (len < 2)
          continue;
        query = query.trim().replace(" ", " +");
        query = " +" + query;
        queryArrayStr.add(query);
      }
    }
    queryArrayStr = ContentGeneratorSupport.removeDuplicatesFromQueries(queryArrayStr);
    queryArrayStr.add(sentence);
    return queryArrayStr;
  }
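  /*
   * Illustrative output of the query builder above: for a noun phrase whose
   * capitalized lemmas are "Albert Einstein", the query string becomes
   * " +Albert +Einstein" (each keyword marked as required); the original
   * sentence itself is always appended as the final, most specific query.
   */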
  private Triple<List<String>, String, String[]> formCandidateFragmentsForPage(
      HitBase item, String originalSentence, List<String> sentsAll) {
    if (sentsAll == null)
      sentsAll = new ArrayList<String>();
    // put the original sentence in the structure
    List<String> origs = new ArrayList<String>();
    origs.add(originalSentence);
    item.setOriginalSentences(origs);
    // form plain text from the snippet
    String snapshot = item.getAbstractText().replace("<b>", " ")
        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");
    // mark the ellipsis with a template expression which can be substituted by
    // the original sentence if it turns out to be relevant
    String snapshotMarked = snapshot.replace("...",
        " _should_find_orig_ . _should_find_orig_");
    String[] fragments = sm.splitSentences(snapshotMarked);
    List<String> allFragms = new ArrayList<String>();
    allFragms.addAll(Arrays.asList(fragments));
    String[] sents = null;
    String downloadedPage = null;
    try {
      if (snapshotMarked.length() != snapshot.length()) {
        downloadedPage = pFetcher.fetchPage(item.getUrl());
        if (downloadedPage != null && downloadedPage.length() > 100) {
          item.setPageContent(downloadedPage);
          String pageContent = Utils.fullStripHTML(item.getPageContent());
          pageContent = GeneratedSentenceProcessor
              .normalizeForSentenceSplitting(pageContent);
          pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
          sents = sm.splitSentences(pageContent);
          sents = ContentGeneratorSupport.cleanListOfSents(sents);
        }
      }
    } catch (Exception e) {
      System.err
          .println("Problem downloading the page and splitting it into sentences");
    }
    return new Triple<List<String>, String, String[]>(allFragms, downloadedPage,
        sents);
  }
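  /*
   * Example of the marking step above (snippet text hypothetical): a snippet
   * "Einstein developed the theory ... one of two pillars" becomes
   * "Einstein developed the theory _should_find_orig_ . _should_find_orig_
   * one of two pillars"; the resulting length change signals that the full
   * page should be downloaded so the elided original sentence can be
   * recovered from it.
   */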
  private String[] formCandidateSentences(String fragment,
      Triple<List<String>, String, String[]> fragmentExtractionResults) {
    String[] mainAndFollowSent = null;
    String downloadedPage = fragmentExtractionResults.getSecond();
    String[] sents = fragmentExtractionResults.getThird();
    if (fragment.length() < 50)
      return null;
    // try to find the original sentence from the webpage
    if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
        && sents.length > 0) {
      try {
        // first try the sentences from the page, sorted by length
        String[] sentsSortedByLength = support.extractSentencesFromPage(downloadedPage);
        try {
          mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
              fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
        } catch (Exception e) {
          e.printStackTrace();
        }
        // if the above gives null, then try to match all sentences from the
        // snippet fragment
        if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
          mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
              fragment.replace("_should_find_orig_", ""), sents);
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    return mainAndFollowSent;
  }
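  /*
   * Contract of the method above: on success, element 0 of the returned array
   * holds the recovered original sentence and any remaining elements hold the
   * sentences that follow it on the page; null means the fragment was too
   * short (under 50 chars), contained no _should_find_orig_ marker, or no
   * original sentence could be matched on the page.
   */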
  private Fragment verifyCandidateSentencesAndFormParagraph(
      String[] candidateSentences, HitBase item, String fragment,
      String originalSentence, List<String> sentsAll) {
    Fragment result = null;
    String pageSentence = candidateSentences[0];
    String followSent = "";
    for (int i = 1; i < candidateSentences.length; i++)
      followSent += candidateSentences[i] + " "; // keep follow-up sentences separated
    followSent = followSent.trim();
    String title = item.getTitle();
    // the resultant sentence SHOULD NOT be longer than four times the size of
    // the snippet fragment
    if (!(pageSentence != null && pageSentence.length() > 50
        && (float) pageSentence.length() / (float) fragment.length() < 4.0))
      return null;
    try { // get score from the syntactic match between the sentence in the
      // original text and the mined sentence
      double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0; // mentalScore is reserved and currently always 0
      SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
          + " " + title, originalSentence);
      List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
      if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
        System.out
            .println("Rejected sentence: no verb, or an imperative verb: "
                + pageSentence);
        return null;
      }
      syntScore = parseTreeChunkListScorer
          .getParseTreeChunkListScore(match);
      System.out.println(parseTreeChunk.listToString(match) + " "
          + syntScore + "\n pre-processed sent = '" + pageSentence + "'");
      if (syntScore < RELEVANCE_THRESHOLD && sentsAll != null) { // try the
        // other sentences of the seed (guard against a null seed list)
        for (String currSent : sentsAll) {
          if (currSent.startsWith(originalSentence))
            continue;
          match = sm.assessRelevance(currSent, pageSentence)
              .getMatchResult();
          double syntScoreCurr = parseTreeChunkListScorer
              .getParseTreeChunkListScore(match);
          if (syntScoreCurr > syntScore) {
            syntScore = syntScoreCurr;
          }
        }
        if (syntScore > RELEVANCE_THRESHOLD) {
          System.out.println("Got match with other sent: "
              + parseTreeChunk.listToString(match) + " " + syntScore);
        }
      }
      measScore = stringDistanceMeasurer.measureStringDistance(
          originalSentence, pageSentence);
      if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
          && measScore < 0.8 && pageSentence.length() > 40) {
        String pageSentenceProc = GeneratedSentenceProcessor
            .acceptableMinedSentence(pageSentence);
        if (pageSentenceProc != null) {
          pageSentenceProc = GeneratedSentenceProcessor
              .processSentence(pageSentenceProc);
          followSent = GeneratedSentenceProcessor.processSentence(followSent);
          if (followSent != null && followSent.length() > 0) {
            pageSentenceProc += " " + followSent;
          }
          pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
          result = new Fragment(pageSentenceProc, syntScore + measScore
              + mentalScore + (double) pageSentenceProc.length() / 50.0);
          result.setSourceURL(item.getUrl());
          result.fragment = fragment;
          System.out.println("Accepted sentence: " + pageSentenceProc
              + "| with title= " + title);
          System.out.println("For fragment = " + fragment);
        } else
          System.out
              .println("Rejected sentence due to wrong area at webpage: "
                  + pageSentence);
      } else
        System.out.println("Rejected sentence due to low score: "
            + pageSentence);
    } catch (Throwable t) {
      t.printStackTrace();
    }
    return result;
  }
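  /*
   * Worked example of the acceptance rule above (numbers hypothetical): with
   * syntScore = 1.2 (> RELEVANCE_THRESHOLD = 1.1), measScore = 0.6 and a
   * 100-character processed sentence, the candidate is accepted with rank
   * 1.2 + 0.6 + 0.0 + 100/50.0 = 3.8. A near-copy of the seed with
   * measScore >= 0.8 is rejected, as is any mined sentence of 40 chars or
   * fewer.
   */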
  /**
   * Takes a single search result for an entity which is the subject of the
   * essay to be written, and forms essay sentences from the title, abstract,
   * and possibly the original page.
   *
   * @param item
   *          search result
   * @param originalSentence
   *          seed for the essay to be written
   * @param sentsAll
   *          List<String> of other sentences in the seed if it is
   *          multi-sentence
   * @return the search result with the generated fragments attached
   */
  public HitBase buildParagraphOfGeneratedText(HitBase item,
      String originalSentence, List<String> sentsAll) {
    List<Fragment> results = new ArrayList<Fragment>();
    Triple<List<String>, String, String[]> fragmentExtractionResults =
        formCandidateFragmentsForPage(item, originalSentence, sentsAll);
    List<String> allFragms = fragmentExtractionResults.getFirst();
    for (String fragment : allFragms) {
      String[] candidateSentences = formCandidateSentences(fragment,
          fragmentExtractionResults);
      if (candidateSentences == null)
        continue;
      Fragment res = verifyCandidateSentencesAndFormParagraph(
          candidateSentences, item, fragment, originalSentence, sentsAll);
      if (res != null)
        results.add(res);
    }
    item.setFragments(results);
    return item;
  }
  public static void main(String[] args) {
    ContentGenerator f = new ContentGenerator();
    List<HitBase> hits = null;
    try {
      // uncomment the sentence you would like to serve as a seed sentence for
      // content generation for an event description
      hits = f.generateContentAbout("Albert Einstein"
      // "Britney Spears - The Femme Fatale Tour"
      // "Rush Time Machine",
      // "Blue Man Group" ,
      // "Belly Dance With Zaharah",
      // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
      // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
      );
      System.out.println(HitBase.toString(hits));
      System.out.println(HitBase.toResultantString(hits));
      // WordFileGenerator.createWordDoc("Essay about Albert Einstein",
      //   hits.get(0).getTitle(), hits);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}