opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.similarity.apps;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.logging.Logger;

 import opennlp.tools.similarity.apps.utils.PageFetcher;
 import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
 import opennlp.tools.similarity.apps.utils.Utils;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

 import org.apache.commons.lang.StringUtils;

 /*
  * This class does content generation by using web mining and syntactic generalization to get sentences from the web, convert and combine them in the form
  * expected to be readable by humans.
  *
  * These are examples of generated articles, given the article title
  * http://www.allvoices.com/contributed-news/9423860/content/81937916-ichie-sings-jazz-blues-contemporary-tunes
  * http://www.allvoices.com/contributed-news/9415063-britney-spears-femme-fatale-in-north-sf-bay-area
  *
  */

 public class RelatedSentenceFinder {
   private static Logger LOG = Logger
       .getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
   PageFetcher pFetcher = new PageFetcher();

   private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
   private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();

   static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();

   // used to indicate that a sentence is an opinion, so more appropriate
   static List<String> MENTAL_VERBS = new ArrayList<String>(
       Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",
           "accept", "agree", "allow", "appeal", "ask", "assume", "believe",
           "check", "confirm", "convince", "deny", "disagree", "explain",
           "ignore", "inform", "remind", "request", "suggest", "suppose",
           "think", "threaten", "try", "understand" }));

   private static final int MAX_FRAGMENT_SENTS = 10;

   public RelatedSentenceFinder() {

   }

   public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
       List<String> sents) throws Exception {
     BingWebQueryRunner yrunner = new BingWebQueryRunner();
     List<HitBase> searchResult = yrunner.runSearch(word);
     return searchResult;
   }

   public List<HitBase> findRelatedOpinionsForSentence(String sentence,
       List<String> sents) throws Exception {
     List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
     System.out.println(" \n\n=== Sentence  = " + sentence);
     List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);

     BingWebQueryRunner yrunner = new BingWebQueryRunner();
     for (String query : nounPhraseQueries) {
       System.out.println("\nquery = " + query);
       // query += " "+join(MENTAL_VERBS, " OR ") ;
       List<HitBase> searchResult = yrunner.runSearch(query);
       if (searchResult != null) {
         for (HitBase item : searchResult) { // got some text from .html
           if (item.getAbstractText() != null
               && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude
                                                          // pdf
             opinionSentencesToAdd
                 .add(augmentWithMinedSentencesAndVerifyRelevance(item,
                     sentence, sents));
           }
         }
       }
     }

     opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
     return opinionSentencesToAdd;
   }

   /**
    * Main content generation function which takes a seed as a person, rock
    * group, or other entity name and produce a list of text fragments by web
    * mining for <br>
    *
    * @param String
    *          entity name
    * @return List<HitBase> of text fragment structures which contain approved
    *         (in terms of relevance) mined sentences, as well as original search
    *         results objects such as doc titles, abstracts, and urls.
    */

   public List<HitBase> generateContentAbout(String sentence) throws Exception {
     List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
     System.out.println(" \n=== Entity to write about = " + sentence);
     List<String> nounPhraseQueries = new ArrayList<String>();

     // nounPhraseQueries.add(sentence + frequentPerformingVerbs);

     BingWebQueryRunner yrunner = new BingWebQueryRunner();
     for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
       List<HitBase> searchResult = yrunner.runSearch(sentence + " "
           + verbAddition);
       if (searchResult != null) {
         for (HitBase item : searchResult) { // got some text from .html
           if (item.getAbstractText() != null
               && !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
             opinionSentencesToAdd
                 .add(augmentWithMinedSentencesAndVerifyRelevance(item,
                     sentence, null));
           }
         }
       }
     }

     opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
     return opinionSentencesToAdd;
   }

   /**
    * Takes a sentence and extracts noun phrases and entity names to from search
    * queries for finding relevant sentences on the web, which are then subject
    * to relevance assessment by Similarity. Search queries should not be too
    * general (irrelevant search results) or too specific (too few search
    * results)
    *
    * @param String
    *          input sentence to form queries
    * @return List<String> of search expressions
    */
   public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
     ParseTreeChunk matcher = new ParseTreeChunk();
     ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
         .getInstance();
     List<List<ParseTreeChunk>> sent1GrpLst = null;

     List<ParseTreeChunk> nPhrases = pos
         .formGroupedPhrasesFromChunksForSentence(sentence).get(0);
     List<String> queryArrayStr = new ArrayList<String>();
     for (ParseTreeChunk ch : nPhrases) {
       String query = "";
       int size = ch.getLemmas().size();

       for (int i = 0; i < size; i++) {
         if (ch.getPOSs().get(i).startsWith("N")
             || ch.getPOSs().get(i).startsWith("J")) {
           query += ch.getLemmas().get(i) + " ";
         }
       }
       query = query.trim();
       int len = query.split(" ").length;
       if (len < 2 || len > 5)
         continue;
       if (len < 4) { // every word should start with capital
         String[] qs = query.split(" ");
         boolean bAccept = true;
         for (String w : qs) {
           if (w.toLowerCase().equals(w)) // idf only two words then
             // has to be person name,
             // title or geo location
             bAccept = false;
         }
         if (!bAccept)
           continue;
       }

       query = query.trim().replace(" ", " +");
       query = " +" + query;

       queryArrayStr.add(query);

     }
     if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
                                     // keywords
       for (ParseTreeChunk ch : nPhrases) {
         String query = "";
         int size = ch.getLemmas().size();

         for (int i = 0; i < size; i++) {
           if (ch.getPOSs().get(i).startsWith("N")
               || ch.getPOSs().get(i).startsWith("J")) {
             query += ch.getLemmas().get(i) + " ";
           }
         }
         query = query.trim();
         int len = query.split(" ").length;
         if (len < 2)
           continue;

         query = query.trim().replace(" ", " +");
         query = " +" + query;

         queryArrayStr.add(query);

       }
     }

     queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
     queryArrayStr.add(sentence);

     return queryArrayStr;

   }

   /**
    * remove dupes from queries to easy cleaning dupes and repetitive search
    * afterwards
    *
    * @param List
    *          <String> of sentences (search queries, or search results
    *          abstracts, or titles
    * @return List<String> of sentences where dupes are removed
    */
   public static List<String> removeDuplicatesFromQueries(List<String> hits) {
     StringDistanceMeasurer meas = new StringDistanceMeasurer();
     double dupeThresh = 0.8; // if more similar, then considered dupes was
     // 0.7
     List<Integer> idsToRemove = new ArrayList<Integer>();
     List<String> hitsDedup = new ArrayList<String>();
     try {
       for (int i = 0; i < hits.size(); i++)
         for (int j = i + 1; j < hits.size(); j++) {
           String title1 = hits.get(i);
           String title2 = hits.get(j);
           if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
             continue;
           if (meas.measureStringDistance(title1, title2) > dupeThresh) {
             idsToRemove.add(j); // dupes found, later list member to
             // be deleted

           }
         }

       for (int i = 0; i < hits.size(); i++)
         if (!idsToRemove.contains(i))
           hitsDedup.add(hits.get(i));

       if (hitsDedup.size() < hits.size()) {
         LOG.info("Removed duplicates from formed query, including "
             + hits.get(idsToRemove.get(0)));
       }

     } catch (Exception e) {
       LOG.severe("Problem removing duplicates from query list");
     }

     return hitsDedup;

   }

   /**
    * remove dupes from search results
    *
    * @param List
    *          <HitBase> of search results objects
    * @return List<String> of search results objects where dupes are removed
    */
   public static List<HitBase> removeDuplicatesFromResultantHits(
       List<HitBase> hits) {
     StringDistanceMeasurer meas = new StringDistanceMeasurer();
     double dupeThresh = // 0.8; // if more similar, then considered dupes was
     0.7;
     List<Integer> idsToRemove = new ArrayList<Integer>();
     List<HitBase> hitsDedup = new ArrayList<HitBase>();
     try {
       for (int i = 0; i < hits.size(); i++)
         for (int j = i + 1; j < hits.size(); j++) {
           HitBase hit2 = hits.get(j);
           List<Fragment> fragmList1 = hits.get(i).getFragments();
           List<Fragment> fragmList2 = hits.get(j).getFragments();
           List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
           for (Fragment f1 : fragmList1)
             for (Fragment f2 : fragmList2) {
               String sf1 = f1.getResultText();
               String sf2 = f2.getResultText();
               if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf1))
                 continue;
               if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
                 fragmList2Results.remove(f2);
                 LOG.info("Removed duplicates from formed fragments list: "
                     + sf2);
               }
             }

           hit2.setFragments(fragmList2Results);
           hits.set(j, hit2);
         }
     } catch (Exception e) {
       LOG.severe("Problem removing duplicates from list of fragment");
     }
     return hits;
   }

   /**
    * Takes single search result for an entity which is the subject of the essay
    * to be written and forms essey sentences from the title, abstract, and
    * possibly original page
    *
    * @param HitBase
    *          item : search result
    * @param originalSentence
    *          : seed for the essay to be written
    * @param sentsAll
    *          : list<String> of other sentences in the seed if it is
    *          multi-sentence
    * @return search result
    */

   public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
       String originalSentence, List<String> sentsAll) {
     if (sentsAll == null)
       sentsAll = new ArrayList<String>();
     // put orig sentence in structure
     List<String> origs = new ArrayList<String>();
     origs.add(originalSentence);
     item.setOriginalSentences(origs);
     String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
         .replace("  ", " ").replace("  ", " ");
     // generation results for this sentence
     List<Fragment> result = new ArrayList<Fragment>();
     // form plain text from snippet
     String snapshot = item.getAbstractText().replace("<b>", " ")
         .replace("</b>", " ").replace("  ", " ").replace("  ", " ");

     ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
         .getInstance();
     // fix a template expression which can be substituted by original if
     // relevant
     String snapshotMarked = snapshot.replace("...",
         " _should_find_orig_ . _should_find_orig_");
     String[] fragments = sm.splitSentences(snapshotMarked);
     List<String> allFragms = new ArrayList<String>();
     allFragms.addAll(Arrays.asList(fragments));

     String[] sents = null;
     String downloadedPage;
     try {
       if (snapshotMarked.length() != snapshot.length()) {
         downloadedPage = pFetcher.fetchPage(item.getUrl());
         if (downloadedPage != null && downloadedPage.length() > 100) {
           item.setPageContent(downloadedPage);
           String pageContent = Utils.fullStripHTML(item.getPageContent());
           pageContent = GeneratedSentenceProcessor
               .normalizeForSentenceSplitting(pageContent);
           pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")// .replace("  ",
                                                                         // ". ")
               .replace("..", ".").replace(". . .", " ").trim(); // sometimes
                                                                 // html breaks
                                                                 // are converted
                                                                 // into ' ' (two
                                                                 // spaces), so
                                                                 // we need to
                                                                 // put '.'
           sents = sm.splitSentences(snapshotMarked);
           ;
           sents = cleanListOfSents(sents);
         }
       }
     } catch (Exception e) {
       // TODO Auto-generated catch block
       // e.printStackTrace();
       System.err
           .println("Problem downloading  the page and splitting into sentences");
       return item;
     }

     for (String fragment : allFragms) {
       String followSent = null;
       if (fragment.length() < 50)
         continue;
       String pageSentence = "";
       // try to find original sentence from webpage
       if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
           && sents.length > 0)
         try {
           String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
               fragment.replace("_should_find_orig_", ""), sents);
           pageSentence = mainAndFollowSent[0];
           followSent = mainAndFollowSent[1];

         } catch (Exception e) {

           // TODO Auto-generated catch block
           e.printStackTrace();
         }
       else
         // or get original snippet
         pageSentence = fragment;
       if (pageSentence != null)
         pageSentence.replace("_should_find_orig_", "");

       // resultant sentence SHOULD NOT be longer than twice the size of
       // snippet fragment
       if (pageSentence != null
           && (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was
                                                                                 // 2.0,
                                                                                 // but
                                                                                 // since
                                                                                 // snippet
                                                                                 // sentences
                                                                                 // are
                                                                                 // rather
                                                                                 // short
                                                                                 // now...
         try { // get score from syntactic match between sentence in
               // original text and mined sentence
           double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

           SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
               + " " + title, originalSentence);
           List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
           if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
             System.out
                 .println("Rejected Sentence : No verb OR Yes imperative verb :"
                     + pageSentence);
             continue;
           }

           syntScore = parseTreeChunkListScorer
               .getParseTreeChunkListScore(match);
           System.out.println(parseTreeChunk.listToString(match) + " "
               + syntScore + "\n pre-processed sent = '" + pageSentence);

           if (syntScore < 1.5) { // trying other sents
             for (String currSent : sentsAll) {
               if (currSent.startsWith(originalSentence))
                 continue;
               match = sm.assessRelevance(currSent, pageSentence)
                   .getMatchResult();
               double syntScoreCurr = parseTreeChunkListScorer
                   .getParseTreeChunkListScore(match);
               if (syntScoreCurr > syntScore) {
                 syntScore = syntScoreCurr;
               }
             }
             if (syntScore > 1.5) {
               System.out.println("Got match with other sent: "
                   + parseTreeChunk.listToString(match) + " " + syntScore);
             }
           }

           measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
               originalSentence, pageSentence);

           // now possibly increase score by finding mental verbs
           // indicating opinions
           for (String s : MENTAL_VERBS) {
             if (pageSentence.indexOf(s) > -1) {
               mentalScore += 0.3;
               break;
             }
           }

           if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5)
               && measScore < 0.8 && pageSentence.length() > 40) // >70
           {
             String pageSentenceProc = GeneratedSentenceProcessor
                 .acceptableMinedSentence(pageSentence);
             if (pageSentenceProc != null) {
               pageSentenceProc = GeneratedSentenceProcessor
                   .processSentence(pageSentenceProc);
               if (followSent != null) {
                 pageSentenceProc += " "
                     + GeneratedSentenceProcessor.processSentence(followSent);
               }

               pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
               Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
                   + mentalScore + (double) pageSentenceProc.length()
                   / (double) 50);
               f.setSourceURL(item.getUrl());
               f.fragment = fragment;
               result.add(f);
               System.out.println("Accepted sentence: " + pageSentenceProc
                   + "| with title= " + title);
               System.out.println("For fragment = " + fragment);
             } else
               System.out
                   .println("Rejected sentence due to wrong area at webpage: "
                       + pageSentence);
           } else
             System.out.println("Rejected sentence due to low score: "
                 + pageSentence);
           // }
         } catch (Throwable t) {
           t.printStackTrace();
         }
       }
     }
     item.setFragments(result);
     return item;
   }

   public static String[] cleanListOfSents(String[] sents) {
     List<String> sentsClean = new ArrayList<String>();
     for (String s : sents) {
       if (s == null || s.trim().length() < 30 || s.length() < 20)
         continue;
       sentsClean.add(s);
     }
     return (String[]) sentsClean.toArray(new String[0]);
   }

   // given a fragment from snippet, finds an original sentence at a webpage by
   // optimizing alignmemt score
   public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
       String fragment, String[] sents) {
     if (fragment.trim().length() < 15)
       return null;

     StringDistanceMeasurer meas = new StringDistanceMeasurer();
     Double dist = 0.0;
     String result = null, followSent = null;
     for (int i = 0; i < sents.length; i++) {
       String s = sents[i];
       if (s == null || s.length() < 30)
         continue;
       Double distCurr = meas.measureStringDistance(s, fragment);
       if (distCurr > dist && distCurr > 0.4) {
         result = s;
         dist = distCurr;
         if (i < sents.length - 1 && sents[i + 1].length() > 60) {
           followSent = sents[i + 1];
         }

       }
     }
     return new String[] { result, followSent };
   }

   // given a fragment from snippet, finds an original sentence at a webpage by
   // optimizing alignmemt score
   public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
       String fragment, String[] sents) {
     if (fragment.trim().length() < 15)
       return null;
     int bestSentIndex = -1;
     StringDistanceMeasurer meas = new StringDistanceMeasurer();
     Double distBest = 10.0; // + sup
     String result = null, followSent = null;
     for (int i = 0; i < sents.length; i++) {
       String s = sents[i];
       if (s == null || s.length() < 30)
         continue;
       Double distCurr = meas.measureStringDistance(s, fragment);
       if (distCurr > distBest) {
         distBest = distCurr;
         bestSentIndex = i;
       }

     }
     if (distBest > 0.4) {
       result = sents[bestSentIndex];

       if (bestSentIndex < sents.length - 1
           && sents[bestSentIndex + 1].length() > 60) {
         followSent = sents[bestSentIndex + 1];
       }

     }

     return new String[] { result, followSent };
   }

   public static void main(String[] args) {
     RelatedSentenceFinder f = new RelatedSentenceFinder();

     List<HitBase> hits = null;
     try {
       // uncomment the sentence you would like to serve as a seed sentence for
       // content generation for an event description

       // uncomment the sentence you would like to serve as a seed sentence for
       // content generation for an event description
       hits = f.generateContentAbout("Albert Einstein"
       // "Britney Spears - The Femme Fatale Tour"
       // "Rush Time Machine",
       // "Blue Man Group" ,
       // "Belly Dance With Zaharah",
       // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
       // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
           );
       System.out.println(HitBase.toString(hits));
       System.out.println(HitBase.toResultantString(hits));
       // WordFileGenerator.createWordDoc("Essey about Albert Einstein",
       // hits.get(0).getTitle(), hits);

     } catch (Exception e) {
       e.printStackTrace();
     }

   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.similarity.apps;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.List;
	import java.util.logging.Logger;

	import opennlp.tools.similarity.apps.utils.PageFetcher;
	import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
	import opennlp.tools.similarity.apps.utils.Utils;
	import opennlp.tools.textsimilarity.ParseTreeChunk;
	import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
	import opennlp.tools.textsimilarity.SentencePairMatchResult;
	import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

	import org.apache.commons.lang.StringUtils;

	/*
	* This class does content generation by using web mining and syntactic generalization to get sentences from the web, convert and combine them in the form
	* expected to be readable by humans.
	*
	* These are examples of generated articles, given the article title
	* http://www.allvoices.com/contributed-news/9423860/content/81937916-ichie-sings-jazz-blues-contemporary-tunes
	* http://www.allvoices.com/contributed-news/9415063-britney-spears-femme-fatale-in-north-sf-bay-area
	*
	*/

	public class RelatedSentenceFinder {
	private static Logger LOG = Logger
	.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
	PageFetcher pFetcher = new PageFetcher();

	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
	private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();

	static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();

	// used to indicate that a sentence is an opinion, so more appropriate
	static List<String> MENTAL_VERBS = new ArrayList<String>(
	Arrays.asList(new String[] { "want", "know", "believe", "appeal", "ask",
	"accept", "agree", "allow", "appeal", "ask", "assume", "believe",
	"check", "confirm", "convince", "deny", "disagree", "explain",
	"ignore", "inform", "remind", "request", "suggest", "suppose",
	"think", "threaten", "try", "understand" }));

	private static final int MAX_FRAGMENT_SENTS = 10;

	public RelatedSentenceFinder() {

	}

	public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
	List<String> sents) throws Exception {
	BingWebQueryRunner yrunner = new BingWebQueryRunner();
	List<HitBase> searchResult = yrunner.runSearch(word);
	return searchResult;
	}

	public List<HitBase> findRelatedOpinionsForSentence(String sentence,
	List<String> sents) throws Exception {
	List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
	System.out.println(" \n\n=== Sentence = " + sentence);
	List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);

	BingWebQueryRunner yrunner = new BingWebQueryRunner();
	for (String query : nounPhraseQueries) {
	System.out.println("\nquery = " + query);
	// query += " "+join(MENTAL_VERBS, " OR ") ;
	List<HitBase> searchResult = yrunner.runSearch(query);
	if (searchResult != null) {
	for (HitBase item : searchResult) { // got some text from .html
	if (item.getAbstractText() != null
	&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude
	// pdf
	opinionSentencesToAdd
	.add(augmentWithMinedSentencesAndVerifyRelevance(item,
	sentence, sents));
	}
	}
	}
	}

	opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
	return opinionSentencesToAdd;
	}

	/**
	* Main content generation function which takes a seed as a person, rock
	* group, or other entity name and produce a list of text fragments by web
	* mining for <br>
	*
	* @param String
	* entity name
	* @return List<HitBase> of text fragment structures which contain approved
	* (in terms of relevance) mined sentences, as well as original search
	* results objects such as doc titles, abstracts, and urls.
	*/

	public List<HitBase> generateContentAbout(String sentence) throws Exception {
	List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
	System.out.println(" \n=== Entity to write about = " + sentence);
	List<String> nounPhraseQueries = new ArrayList<String>();

	// nounPhraseQueries.add(sentence + frequentPerformingVerbs);

	BingWebQueryRunner yrunner = new BingWebQueryRunner();
	for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
	List<HitBase> searchResult = yrunner.runSearch(sentence + " "
	+ verbAddition);
	if (searchResult != null) {
	for (HitBase item : searchResult) { // got some text from .html
	if (item.getAbstractText() != null
	&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
	opinionSentencesToAdd
	.add(augmentWithMinedSentencesAndVerifyRelevance(item,
	sentence, null));
	}
	}
	}
	}

	opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
	return opinionSentencesToAdd;
	}

	/**
	* Takes a sentence and extracts noun phrases and entity names to from search
	* queries for finding relevant sentences on the web, which are then subject
	* to relevance assessment by Similarity. Search queries should not be too
	* general (irrelevant search results) or too specific (too few search
	* results)
	*
	* @param String
	* input sentence to form queries
	* @return List<String> of search expressions
	*/
	public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
	ParseTreeChunk matcher = new ParseTreeChunk();
	ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
	.getInstance();
	List<List<ParseTreeChunk>> sent1GrpLst = null;

	List<ParseTreeChunk> nPhrases = pos
	.formGroupedPhrasesFromChunksForSentence(sentence).get(0);
	List<String> queryArrayStr = new ArrayList<String>();
	for (ParseTreeChunk ch : nPhrases) {
	String query = "";
	int size = ch.getLemmas().size();

	for (int i = 0; i < size; i++) {
	if (ch.getPOSs().get(i).startsWith("N")
	\|\| ch.getPOSs().get(i).startsWith("J")) {
	query += ch.getLemmas().get(i) + " ";
	}
	}
	query = query.trim();
	int len = query.split(" ").length;
	if (len < 2 \|\| len > 5)
	continue;
	if (len < 4) { // every word should start with capital
	String[] qs = query.split(" ");
	boolean bAccept = true;
	for (String w : qs) {
	if (w.toLowerCase().equals(w)) // idf only two words then
	// has to be person name,
	// title or geo location
	bAccept = false;
	}
	if (!bAccept)
	continue;
	}

	query = query.trim().replace(" ", " +");
	query = " +" + query;

	queryArrayStr.add(query);

	}
	if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
	// keywords
	for (ParseTreeChunk ch : nPhrases) {
	String query = "";
	int size = ch.getLemmas().size();

	for (int i = 0; i < size; i++) {
	if (ch.getPOSs().get(i).startsWith("N")
	\|\| ch.getPOSs().get(i).startsWith("J")) {
	query += ch.getLemmas().get(i) + " ";
	}
	}
	query = query.trim();
	int len = query.split(" ").length;
	if (len < 2)
	continue;

	query = query.trim().replace(" ", " +");
	query = " +" + query;

	queryArrayStr.add(query);

	}
	}

	queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
	queryArrayStr.add(sentence);

	return queryArrayStr;

	}

	/**
	* remove dupes from queries to easy cleaning dupes and repetitive search
	* afterwards
	*
	* @param List
	* <String> of sentences (search queries, or search results
	* abstracts, or titles
	* @return List<String> of sentences where dupes are removed
	*/
	public static List<String> removeDuplicatesFromQueries(List<String> hits) {
	StringDistanceMeasurer meas = new StringDistanceMeasurer();
	double dupeThresh = 0.8; // if more similar, then considered dupes was
	// 0.7
	List<Integer> idsToRemove = new ArrayList<Integer>();
	List<String> hitsDedup = new ArrayList<String>();
	try {
	for (int i = 0; i < hits.size(); i++)
	for (int j = i + 1; j < hits.size(); j++) {
	String title1 = hits.get(i);
	String title2 = hits.get(j);
	if (StringUtils.isEmpty(title1) \|\| StringUtils.isEmpty(title2))
	continue;
	if (meas.measureStringDistance(title1, title2) > dupeThresh) {
	idsToRemove.add(j); // dupes found, later list member to
	// be deleted

	}
	}

	for (int i = 0; i < hits.size(); i++)
	if (!idsToRemove.contains(i))
	hitsDedup.add(hits.get(i));

	if (hitsDedup.size() < hits.size()) {
	LOG.info("Removed duplicates from formed query, including "
	+ hits.get(idsToRemove.get(0)));
	}

	} catch (Exception e) {
	LOG.severe("Problem removing duplicates from query list");
	}

	return hitsDedup;

	}

	/**
	* remove dupes from search results
	*
	* @param List
	* <HitBase> of search results objects
	* @return List<String> of search results objects where dupes are removed
	*/
	public static List<HitBase> removeDuplicatesFromResultantHits(
	List<HitBase> hits) {
	StringDistanceMeasurer meas = new StringDistanceMeasurer();
	double dupeThresh = // 0.8; // if more similar, then considered dupes was
	0.7;
	List<Integer> idsToRemove = new ArrayList<Integer>();
	List<HitBase> hitsDedup = new ArrayList<HitBase>();
	try {
	for (int i = 0; i < hits.size(); i++)
	for (int j = i + 1; j < hits.size(); j++) {
	HitBase hit2 = hits.get(j);
	List<Fragment> fragmList1 = hits.get(i).getFragments();
	List<Fragment> fragmList2 = hits.get(j).getFragments();
	List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
	for (Fragment f1 : fragmList1)
	for (Fragment f2 : fragmList2) {
	String sf1 = f1.getResultText();
	String sf2 = f2.getResultText();
	if (StringUtils.isEmpty(sf1) \|\| StringUtils.isEmpty(sf1))
	continue;
	if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
	fragmList2Results.remove(f2);
	LOG.info("Removed duplicates from formed fragments list: "
	+ sf2);
	}
	}

	hit2.setFragments(fragmList2Results);
	hits.set(j, hit2);
	}
	} catch (Exception e) {
	LOG.severe("Problem removing duplicates from list of fragment");
	}
	return hits;
	}

	/**
	* Takes single search result for an entity which is the subject of the essay
	* to be written and forms essey sentences from the title, abstract, and
	* possibly original page
	*
	* @param HitBase
	* item : search result
	* @param originalSentence
	* : seed for the essay to be written
	* @param sentsAll
	* : list<String> of other sentences in the seed if it is
	* multi-sentence
	* @return search result
	*/

	public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
	String originalSentence, List<String> sentsAll) {
	if (sentsAll == null)
	sentsAll = new ArrayList<String>();
	// put orig sentence in structure
	List<String> origs = new ArrayList<String>();
	origs.add(originalSentence);
	item.setOriginalSentences(origs);
	String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
	.replace(" ", " ").replace(" ", " ");
	// generation results for this sentence
	List<Fragment> result = new ArrayList<Fragment>();
	// form plain text from snippet
	String snapshot = item.getAbstractText().replace("<b>", " ")
	.replace("</b>", " ").replace(" ", " ").replace(" ", " ");

	ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
	.getInstance();
	// fix a template expression which can be substituted by original if
	// relevant
	String snapshotMarked = snapshot.replace("...",
	" _should_find_orig_ . _should_find_orig_");
	String[] fragments = sm.splitSentences(snapshotMarked);
	List<String> allFragms = new ArrayList<String>();
	allFragms.addAll(Arrays.asList(fragments));

	String[] sents = null;
	String downloadedPage;
	try {
	if (snapshotMarked.length() != snapshot.length()) {
	downloadedPage = pFetcher.fetchPage(item.getUrl());
	if (downloadedPage != null && downloadedPage.length() > 100) {
	item.setPageContent(downloadedPage);
	String pageContent = Utils.fullStripHTML(item.getPageContent());
	pageContent = GeneratedSentenceProcessor
	.normalizeForSentenceSplitting(pageContent);
	pageContent = pageContent.trim().replaceAll(" [A-Z]", ". $0")// .replace(" ",
	// ". ")
	.replace("..", ".").replace(". . .", " ").trim(); // sometimes
	// html breaks
	// are converted
	// into ' ' (two
	// spaces), so
	// we need to
	// put '.'
	sents = sm.splitSentences(snapshotMarked);
	;
	sents = cleanListOfSents(sents);
	}
	}
	} catch (Exception e) {
	// TODO Auto-generated catch block
	// e.printStackTrace();
	System.err
	.println("Problem downloading the page and splitting into sentences");
	return item;
	}

	for (String fragment : allFragms) {
	String followSent = null;
	if (fragment.length() < 50)
	continue;
	String pageSentence = "";
	// try to find original sentence from webpage
	if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
	&& sents.length > 0)
	try {
	String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
	fragment.replace("_should_find_orig_", ""), sents);
	pageSentence = mainAndFollowSent[0];
	followSent = mainAndFollowSent[1];

	} catch (Exception e) {

	// TODO Auto-generated catch block
	e.printStackTrace();
	}
	else
	// or get original snippet
	pageSentence = fragment;
	if (pageSentence != null)
	pageSentence.replace("_should_find_orig_", "");

	// resultant sentence SHOULD NOT be longer than twice the size of
	// snippet fragment
	if (pageSentence != null
	&& (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was
	// 2.0,
	// but
	// since
	// snippet
	// sentences
	// are
	// rather
	// short
	// now...
	try { // get score from syntactic match between sentence in
	// original text and mined sentence
	double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

	SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
	+ " " + title, originalSentence);
	List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
	if (!matchRes.isVerbExists() \|\| matchRes.isImperativeVerb()) {
	System.out
	.println("Rejected Sentence : No verb OR Yes imperative verb :"
	+ pageSentence);
	continue;
	}

	syntScore = parseTreeChunkListScorer
	.getParseTreeChunkListScore(match);
	System.out.println(parseTreeChunk.listToString(match) + " "
	+ syntScore + "\n pre-processed sent = '" + pageSentence);

	if (syntScore < 1.5) { // trying other sents
	for (String currSent : sentsAll) {
	if (currSent.startsWith(originalSentence))
	continue;
	match = sm.assessRelevance(currSent, pageSentence)
	.getMatchResult();
	double syntScoreCurr = parseTreeChunkListScorer
	.getParseTreeChunkListScore(match);
	if (syntScoreCurr > syntScore) {
	syntScore = syntScoreCurr;
	}
	}
	if (syntScore > 1.5) {
	System.out.println("Got match with other sent: "
	+ parseTreeChunk.listToString(match) + " " + syntScore);
	}
	}

	measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
	originalSentence, pageSentence);

	// now possibly increase score by finding mental verbs
	// indicating opinions
	for (String s : MENTAL_VERBS) {
	if (pageSentence.indexOf(s) > -1) {
	mentalScore += 0.3;
	break;
	}
	}

	if ((syntScore > 1.5 \|\| measScore > 0.5 \|\| mentalScore > 0.5)
	&& measScore < 0.8 && pageSentence.length() > 40) // >70
	{
	String pageSentenceProc = GeneratedSentenceProcessor
	.acceptableMinedSentence(pageSentence);
	if (pageSentenceProc != null) {
	pageSentenceProc = GeneratedSentenceProcessor
	.processSentence(pageSentenceProc);
	if (followSent != null) {
	pageSentenceProc += " "
	+ GeneratedSentenceProcessor.processSentence(followSent);
	}

	pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
	Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
	+ mentalScore + (double) pageSentenceProc.length()
	/ (double) 50);
	f.setSourceURL(item.getUrl());
	f.fragment = fragment;
	result.add(f);
	System.out.println("Accepted sentence: " + pageSentenceProc
	+ "\| with title= " + title);
	System.out.println("For fragment = " + fragment);
	} else
	System.out
	.println("Rejected sentence due to wrong area at webpage: "
	+ pageSentence);
	} else
	System.out.println("Rejected sentence due to low score: "
	+ pageSentence);
	// }
	} catch (Throwable t) {
	t.printStackTrace();
	}
	}
	}
	item.setFragments(result);
	return item;
	}

	public static String[] cleanListOfSents(String[] sents) {
	List<String> sentsClean = new ArrayList<String>();
	for (String s : sents) {
	if (s == null \|\| s.trim().length() < 30 \|\| s.length() < 20)
	continue;
	sentsClean.add(s);
	}
	return (String[]) sentsClean.toArray(new String[0]);
	}

	// given a fragment from snippet, finds an original sentence at a webpage by
	// optimizing alignmemt score
	public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
	String fragment, String[] sents) {
	if (fragment.trim().length() < 15)
	return null;

	StringDistanceMeasurer meas = new StringDistanceMeasurer();
	Double dist = 0.0;
	String result = null, followSent = null;
	for (int i = 0; i < sents.length; i++) {
	String s = sents[i];
	if (s == null \|\| s.length() < 30)
	continue;
	Double distCurr = meas.measureStringDistance(s, fragment);
	if (distCurr > dist && distCurr > 0.4) {
	result = s;
	dist = distCurr;
	if (i < sents.length - 1 && sents[i + 1].length() > 60) {
	followSent = sents[i + 1];
	}

	}
	}
	return new String[] { result, followSent };
	}

	// given a fragment from snippet, finds an original sentence at a webpage by
	// optimizing alignmemt score
	public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
	String fragment, String[] sents) {
	if (fragment.trim().length() < 15)
	return null;
	int bestSentIndex = -1;
	StringDistanceMeasurer meas = new StringDistanceMeasurer();
	Double distBest = 10.0; // + sup
	String result = null, followSent = null;
	for (int i = 0; i < sents.length; i++) {
	String s = sents[i];
	if (s == null \|\| s.length() < 30)
	continue;
	Double distCurr = meas.measureStringDistance(s, fragment);
	if (distCurr > distBest) {
	distBest = distCurr;
	bestSentIndex = i;
	}

	}
	if (distBest > 0.4) {
	result = sents[bestSentIndex];

	if (bestSentIndex < sents.length - 1
	&& sents[bestSentIndex + 1].length() > 60) {
	followSent = sents[bestSentIndex + 1];
	}

	}

	return new String[] { result, followSent };
	}

	public static void main(String[] args) {
	RelatedSentenceFinder f = new RelatedSentenceFinder();

	List<HitBase> hits = null;
	try {
	// uncomment the sentence you would like to serve as a seed sentence for
	// content generation for an event description

	// uncomment the sentence you would like to serve as a seed sentence for
	// content generation for an event description
	hits = f.generateContentAbout("Albert Einstein"
	// "Britney Spears - The Femme Fatale Tour"
	// "Rush Time Machine",
	// "Blue Man Group" ,
	// "Belly Dance With Zaharah",
	// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
	// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
	);
	System.out.println(HitBase.toString(hits));
	System.out.println(HitBase.toResultantString(hits));
	// WordFileGenerator.createWordDoc("Essey about Albert Einstein",
	// hits.get(0).getTitle(), hits);

	} catch (Exception e) {
	e.printStackTrace();
	}

	}

	}