/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Logger;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
import org.apache.commons.lang.StringUtils;
/*
 * This class performs content generation: it uses web mining and syntactic generalization to obtain sentences from the web,
 * then converts and combines them into a form readable by humans.
 *
 * Examples of generated articles, given the article title:
 * http://www.allvoices.com/contributed-news/9423860/content/81937916-ichie-sings-jazz-blues-contemporary-tunes
 * http://www.allvoices.com/contributed-news/9415063-britney-spears-femme-fatale-in-north-sf-bay-area
 *
 */
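/*
 * Example usage (a minimal sketch, mirroring main() below; assumes the search engine behind
 * BingWebQueryRunner is reachable and configured):
 *
 *   RelatedSentenceFinder finder = new RelatedSentenceFinder();
 *   List<HitBase> hits = finder.generateContentAbout("Albert Einstein");
 *   System.out.println(HitBase.toResultantString(hits));
 */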
public class RelatedSentenceFinder
{
private static final Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinder");
PageFetcher pFetcher = new PageFetcher();
private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();
// mental verbs indicate that a sentence expresses an opinion, making it more appropriate for opinion mining
static List<String> MENTAL_VERBS = new ArrayList<String>(Arrays.asList(new String[] { "want", "know", "believe",
"appeal", "ask", "accept", "agree", "allow", "assume", "check", "confirm",
"convince", "deny", "disagree", "explain", "ignore", "inform", "remind", "request", "suggest", "suppose",
"think", "threaten", "try", "understand" }));
private static final int MAX_FRAGMENT_SENTS = 10;
public RelatedSentenceFinder()
{
}
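// "fast and dummy" variant: runs a plain web search on the word and returns the raw hits
// without any relevance verification (the sents argument is unused)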
public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word, List<String> sents) throws Exception
{
BingWebQueryRunner yrunner = new BingWebQueryRunner();
List<HitBase> searchResult = yrunner.runSearch(word);
return searchResult;
}
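// forms noun-phrase queries from the sentence, runs a web search for each,
// and mines relevant opinion sentences from the non-PDF results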
public List<HitBase> findRelatedOpinionsForSentence(String sentence, List<String> sents) throws Exception
{
List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
System.out.println(" \n\n=== Sentence = " + sentence);
List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence);
BingWebQueryRunner yrunner = new BingWebQueryRunner();
for (String query : nounPhraseQueries)
{
System.out.println("\nquery = " + query);
// query += " "+join(MENTAL_VERBS, " OR ") ;
List<HitBase> searchResult = yrunner.runSearch(query);
if (searchResult != null)
{
for (HitBase item : searchResult)
{ // got some text from .html
if (item.getAbstractText() != null && !(item.getUrl().indexOf(".pdf") > 0))
{ // exclude
// pdf
opinionSentencesToAdd.add(augmentWithMinedSentencesAndVerifyRelevance(item, sentence, sents));
}
}
}
}
opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
return opinionSentencesToAdd;
}
/**
 * Main content generation function which takes a seed such as a person, rock group, or other entity name and produces a list of text fragments by web mining.
 *
 * @param sentence the entity name to write about
 * @return list of text fragment structures which contain approved (in terms of relevance) mined sentences, as well as original search result data
 * such as doc titles, abstracts, and urls
 */
public List<HitBase> generateContentAbout(String sentence) throws Exception
{
List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
System.out.println(" \n=== Entity to write about = " + sentence);
List<String> nounPhraseQueries = new ArrayList<String>();
//nounPhraseQueries.add(sentence + frequentPerformingVerbs);
BingWebQueryRunner yrunner = new BingWebQueryRunner();
for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs)
{
List<HitBase> searchResult = yrunner.runSearch(sentence + " " + verbAddition);
if (searchResult != null)
{
for (HitBase item : searchResult)
{ // got some text from .html
if (item.getAbstractText() != null && !(item.getUrl().indexOf(".pdf") > 0))
{ // exclude pdf
opinionSentencesToAdd.add(augmentWithMinedSentencesAndVerifyRelevance(item, sentence, null));
}
}
}
}
opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
return opinionSentencesToAdd;
}
/**
 * Takes a sentence and extracts noun phrases and entity names to form search queries for finding relevant sentences on the web, which are
 * then subject to relevance assessment by the similarity component. Search queries should be neither too general (irrelevant search results) nor too specific (too few
 * search results).
 * @param sentence the input sentence from which queries are formed
 * @return list of search expressions
 */
public static List<String> buildSearchEngineQueryFromSentence(String sentence)
{
ParseTreeChunk matcher = new ParseTreeChunk();
ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance();
List<ParseTreeChunk> nPhrases = pos.formGroupedPhrasesFromChunksForSentence(sentence).get(0);
List<String> queryArrayStr = new ArrayList<String>();
for (ParseTreeChunk ch : nPhrases)
{
String query = "";
int size = ch.getLemmas().size();
for (int i = 0; i < size; i++)
{
if (ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i).startsWith("J"))
{
query += ch.getLemmas().get(i) + " ";
}
}
query = query.trim();
int len = query.split(" ").length;
if (len < 2 || len > 5)
continue;
if (len < 4)
{ // every word should start with capital
String[] qs = query.split(" ");
boolean bAccept = true;
for (String w : qs)
{
if (w.toLowerCase().equals(w)) // if only two or three words, the phrase
// has to be a person name,
// title, or geo location
bAccept = false;
}
if (!bAccept)
continue;
}
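// prefix each keyword with '+' so the search engine treats it as a required term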
query = query.trim().replace(" ", " +");
query = " +" + query;
queryArrayStr.add(query);
}
if (queryArrayStr.size() < 1)
{ // relax constraints: accept noun phrases down to 2
// keywords
for (ParseTreeChunk ch : nPhrases)
{
String query = "";
int size = ch.getLemmas().size();
for (int i = 0; i < size; i++)
{
if (ch.getPOSs().get(i).startsWith("N") || ch.getPOSs().get(i).startsWith("J"))
{
query += ch.getLemmas().get(i) + " ";
}
}
query = query.trim();
int len = query.split(" ").length;
if (len < 2)
continue;
query = query.trim().replace(" ", " +");
query = " +" + query;
queryArrayStr.add(query);
}
}
queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
queryArrayStr.add(sentence);
return queryArrayStr;
}
/** Removes near-duplicate queries to avoid repetitive cleaning and repetitive searches
 * afterwards.
 *
 * @param hits list of sentences (search queries, search result abstracts, or titles)
 * @return list of sentences with duplicates removed
 */
public static List<String> removeDuplicatesFromQueries(List<String> hits)
{
StringDistanceMeasurer meas = new StringDistanceMeasurer();
double dupeThresh = 0.8; // if more similar than this, consider duplicates (was 0.7)
List<Integer> idsToRemove = new ArrayList<Integer>();
List<String> hitsDedup = new ArrayList<String>();
try
{
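// pairwise comparison: for each pair of near-identical strings, mark the later one for removal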
for (int i = 0; i < hits.size(); i++)
for (int j = i + 1; j < hits.size(); j++)
{
String title1 = hits.get(i);
String title2 = hits.get(j);
if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2))
continue;
if (meas.measureStringDistance(title1, title2) > dupeThresh)
{
idsToRemove.add(j); // dupes found, later list member to
// be deleted
}
}
for (int i = 0; i < hits.size(); i++)
if (!idsToRemove.contains(i))
hitsDedup.add(hits.get(i));
if (hitsDedup.size() < hits.size())
{
LOG.info("Removed duplicates from formed query, including " + hits.get(idsToRemove.get(0)));
}
}
catch (Exception e)
{
LOG.severe("Problem removing duplicates from query list");
}
return hitsDedup;
}
/** Removes near-duplicate fragments from search results.
 *
 * @param hits list of search result objects
 * @return list of search result objects with duplicate fragments removed
 */
public static List<HitBase> removeDuplicatesFromResultantHits(List<HitBase> hits)
{
StringDistanceMeasurer meas = new StringDistanceMeasurer();
double dupeThresh = 0.7; // if more similar than this, consider duplicates (was 0.8)
List<Integer> idsToRemove = new ArrayList<Integer>();
List<HitBase> hitsDedup = new ArrayList<HitBase>();
try
{
for (int i = 0; i < hits.size(); i++)
for (int j = i + 1; j < hits.size(); j++)
{
HitBase hit2 = hits.get(j);
List<Fragment> fragmList1 = hits.get(i).getFragments();
List<Fragment> fragmList2 = hits.get(j).getFragments();
List<Fragment> fragmList2Results = new ArrayList<Fragment>(fragmList2);
for(Fragment f1: fragmList1)
for(Fragment f2: fragmList2){
String sf1 = f1.getResultText();
String sf2 = f2.getResultText();
if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf2))
continue;
if (meas.measureStringDistance(sf1, sf2) > dupeThresh)
{
fragmList2Results.remove(f2);
LOG.info("Removed duplicates from formed fragments list: " + sf2);
}
}
hit2.setFragments(fragmList2Results);
hits.set(j, hit2 );
}
}
catch (Exception e)
{
LOG.severe("Problem removing duplicates from list of fragments");
}
return hits;
}
/**
 * Takes a single search result for an entity which is the subject of the essay to be written and forms essay sentences
 * from the title, abstract, and possibly the original page.
 * @param item search result
 * @param originalSentence seed for the essay to be written
 * @param sentsAll other sentences in the seed if it is multi-sentence
 * @return search result augmented with mined fragments
 */
public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item, String originalSentence,
List<String> sentsAll)
{
if (sentsAll==null)
sentsAll = new ArrayList<String>();
// put orig sentence in structure
List<String> origs = new ArrayList<String>();
origs.add(originalSentence);
item.setOriginalSentences(origs);
String title = item.getTitle().replace("<b>", " ").replace("</b>", " ").replace("  ", " ").replace("  ", " ");
// generation results for this sentence
List<Fragment> result = new ArrayList<Fragment>();
// form plain text from snippet
String snapshot = item.getAbstractText().replace("<b>", " ").replace("</b>", " ").replace("  ", " ")
.replace("  ", " ");
ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance();
// mark each '...' ellipsis in the snippet with a template token, which may later be
// substituted by the original sentence from the page, if relevant
String snapshotMarked = snapshot.replace("...", " _should_find_orig_ . _should_find_orig_");
String[] fragments = sm.splitSentences(snapshotMarked);
List<String> allFragms = new ArrayList<String>();
allFragms.addAll(Arrays.asList(fragments));
String[] sents = null; String downloadedPage;
try
{
if (snapshotMarked.length() != snapshot.length())
{
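// lengths differ iff the snippet contained '...': fetch the full page to recover the original sentences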
downloadedPage = pFetcher.fetchPage(item.getUrl());
if (downloadedPage != null && downloadedPage.length() > 100)
{
item.setPageContent(downloadedPage);
String pageContent = Utils.fullStripHTML(item.getPageContent());
pageContent = GeneratedSentenceProcessor.normalizeForSentenceSplitting(pageContent);
pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")//.replace("  ", ". ")
.replace("..", ".").replace(". . .", " ")
.trim(); // sometimes html breaks are converted into '  ' (two spaces), so we need to put '.'
sents = sm.splitSentences(pageContent);
sents = cleanListOfSents(sents);
}
}
}
catch (Exception e)
{
System.err.println("Problem downloading the page and splitting into sentences");
return item;
}
for (String fragment : allFragms)
{
String followSent = null;
if (fragment.length() < 50)
continue;
String pageSentence = "";
// try to find original sentence from webpage
if (fragment.indexOf("_should_find_orig_") > -1 && sents != null && sents.length > 0)
{
try
{
String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
fragment.replace("_should_find_orig_", ""), sents);
pageSentence = mainAndFollowSent[0];
followSent = mainAndFollowSent[1];
}
catch (Exception e)
{
e.printStackTrace();
}
}
else
{
// or get the original snippet as-is
pageSentence = fragment;
}
if (pageSentence != null)
pageSentence = pageSentence.replace("_should_find_orig_", "");
// resultant sentence should not be longer than four times the size of the
// snippet fragment
if (pageSentence != null && (float) pageSentence.length() / (float) fragment.length() < 4.0)
{ // was 2.0, but since snippet sentences are rather short now...
try
{ // get score from syntactic match between sentence in
// original text and mined sentence
double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
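// syntScore: syntactic generalization score; measScore: string distance to the seed sentence;
// mentalScore: bonus when the sentence contains a mental (opinion) verb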
SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence + " " + title, originalSentence);
List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
if (!matchRes.isVerbExists() || matchRes.isImperativeVerb())
{
System.out.println("Rejected Sentence : No verb OR Yes imperative verb :" + pageSentence);
continue;
}
syntScore = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
System.out.println(parseTreeChunk.listToString(match) + " " + syntScore
+ "\n pre-processed sent = '" + pageSentence + "'");
if (syntScore < 1.5)
{ // trying other sents
for (String currSent : sentsAll)
{
if (currSent.startsWith(originalSentence))
continue;
match = sm.assessRelevance(currSent, pageSentence).getMatchResult();
double syntScoreCurr = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
if (syntScoreCurr > syntScore)
{
syntScore = syntScoreCurr;
}
}
if (syntScore > 1.5)
{
System.out.println("Got match with other sent: " + parseTreeChunk.listToString(match) + " "
+ syntScore);
}
}
measScore = STRING_DISTANCE_MEASURER.measureStringDistance(originalSentence, pageSentence);
// now possibly increase score by finding mental verbs
// indicating opinions
for (String s : MENTAL_VERBS)
{
if (pageSentence.indexOf(s) > -1)
{
mentalScore += 0.3;
break;
}
}
if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5) && measScore < 0.8
&& pageSentence.length() > 40) // >70
{
String pageSentenceProc = GeneratedSentenceProcessor.acceptableMinedSentence(pageSentence);
if (pageSentenceProc != null)
{
pageSentenceProc = GeneratedSentenceProcessor.processSentence(pageSentenceProc);
if (followSent != null)
{
pageSentenceProc += " " + GeneratedSentenceProcessor.processSentence(followSent);
}
pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
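// fragment score combines syntactic match, string distance, mental-verb bonus,
// and a small bonus proportional to sentence length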
Fragment f = new Fragment(pageSentenceProc, syntScore + measScore + mentalScore
+ (double) pageSentenceProc.length() / (double) 50);
f.setSourceURL(item.getUrl());
f.fragment = fragment;
result.add(f);
System.out.println("Accepted sentence: " + pageSentenceProc + "| with title= " + title);
System.out.println("For fragment = " + fragment);
}
else
System.out.println("Rejected sentence due to wrong area at webpage: " + pageSentence);
}
else
System.out.println("Rejected sentence due to low score: " + pageSentence);
}
catch (Throwable t)
{
t.printStackTrace();
}
}
}
item.setFragments(result);
return item;
}
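// drops null, too-short, and mostly-whitespace sentences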
public static String[] cleanListOfSents(String[] sents)
{
List<String> sentsClean = new ArrayList<String>();
for (String s : sents)
{
if (s == null || s.trim().length() < 30)
continue;
sentsClean.add(s);
}
return sentsClean.toArray(new String[0]);
}
// given a fragment from snippet, finds an original sentence at a webpage by optimizing alignment score
public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(String fragment, String[] sents)
{
if (fragment.trim().length() < 15)
return null;
StringDistanceMeasurer meas = new StringDistanceMeasurer();
Double dist = 0.0;
String result = null, followSent = null;
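// scan page sentences, keeping the one most similar to the fragment (similarity must exceed 0.4),
// together with the following sentence when it is long enough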
for (int i = 0; i < sents.length; i++)
{
String s = sents[i];
if (s == null || s.length() < 30)
continue;
Double distCurr = meas.measureStringDistance(s, fragment);
if (distCurr > dist && distCurr > 0.4)
{
result = s;
dist = distCurr;
if (i < sents.length - 1 && sents[i + 1].length() > 60)
{
followSent = sents[i + 1];
}
}
}
return new String[] { result, followSent };
}
// given a fragment from snippet, finds an original sentence at a webpage by optimizing alignment score
public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(String fragment, String[] sents)
{
if (fragment.trim().length() < 15)
return null;
int bestSentIndex = -1;
StringDistanceMeasurer meas = new StringDistanceMeasurer();
Double distBest = 0.0; // was 10.0, which no similarity score could exceed, so no best sentence was ever found
String result = null, followSent = null;
for (int i = 0; i < sents.length; i++)
{
String s = sents[i];
if (s == null || s.length() < 30)
continue;
Double distCurr = meas.measureStringDistance(s, fragment);
if (distCurr>distBest){
distBest = distCurr;
bestSentIndex = i;
}
}
if (distBest > 0.4)
{
result = sents[bestSentIndex];
if (bestSentIndex < sents.length - 1 && sents[bestSentIndex + 1].length() > 60)
{
followSent = sents[bestSentIndex + 1];
}
}
return new String[] { result, followSent };
}
public static void main(String[] args)
{
RelatedSentenceFinder f = new RelatedSentenceFinder();
List<HitBase> hits = null;
try
{
// uncomment the sentence you would like to serve as a seed sentence for content generation for an event description
hits = f.generateContentAbout(
"Albert Einstein"
//"Britney Spears - The Femme Fatale Tour"
// "Rush Time Machine",
// "Blue Man Group" ,
// "Belly Dance With Zaharah",
// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
);
System.out.println(HitBase.toString(hits));
System.out.println(HitBase.toResultantString(hits));
//WordFileGenerator.createWordDoc("Essay about Albert Einstein", hits.get(0).getTitle(), hits);
}
catch (Exception e)
{
e.printStackTrace();
}
}
}