/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Logger;
import opennlp.tools.parse_thicket.Triple;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
/*
 * Generates content by web mining and syntactic generalization: sentences are
 * mined from the web, then converted and combined into a form that reads
 * naturally to humans and is not distinguishable from genuine content by
 * search engines.
 */
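// Pipeline sketch (a summary of the methods below, not additional behavior):
//   1. generateContentAbout() runs web searches seeded with the entity name plus
//      frequent performing verbs;
//   2. formCandidateFragmentsForPage() splits each hit's snippet (and, if needed,
//      the fetched page) into candidate fragments;
//   3. formCandidateSentences() recovers full original sentences for marked
//      snippet fragments;
//   4. verifyCandidateSentencesAndFormParagraph() scores each candidate and keeps
//      relevant ones as Fragment objects attached to the search result.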
public class ContentGenerator /*extends RelatedSentenceFinder*/ {
private static final Logger LOG = Logger
.getLogger("opennlp.tools.similarity.apps.ContentGenerator");
PageFetcher pFetcher = new PageFetcher();
ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
.getInstance();
protected ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
protected ParseTreeChunk parseTreeChunk = new ParseTreeChunk();
protected static StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer();
protected BingQueryRunner yrunner = new BingQueryRunner();
protected ContentGeneratorSupport support = new ContentGeneratorSupport();
protected int MAX_STEPS = 1;
protected int MAX_SEARCH_RESULTS = 1;
protected float RELEVANCE_THRESHOLD = 1.1f;
//private static final int MAX_FRAGMENT_SENTS = 10;
public ContentGenerator(int ms, int msr, float thresh, String key) {
this.MAX_STEPS = ms;
this.MAX_SEARCH_RESULTS = msr;
this.RELEVANCE_THRESHOLD=thresh;
yrunner.setKey(key);
}
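// Illustrative construction (values and the key are placeholders, not
// recommendations):
//   ContentGenerator gen = new ContentGenerator(1, 10, 1.1f, "YOUR_BING_API_KEY");
//   List<HitBase> hits = gen.generateContentAbout("Albert Einstein");
// ms bounds how many 'performing verb' query variations are tried, msr caps the
// number of search results kept per query, and thresh is the relevance threshold
// for the syntactic match score.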
public ContentGenerator() {
}
public void setLang(String lang) {
yrunner.setLang(lang);
}
/**
 * Main content generation function which takes a seed (a person, rock
 * group, or other entity name) and produces a list of text fragments by
 * web mining.
 *
 * @param sentence
 *          the entity name to write about
 * @return List<HitBase> of text fragment structures which contain approved
 *         (in terms of relevance) mined sentences, as well as original
 *         search result data such as document titles, abstracts, and URLs.
 */
public List<HitBase> generateContentAbout(String sentence) throws Exception {
List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
System.out.println(" \n=== Entity to write about = " + sentence);
int stepCount=0;
for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
List<HitBase> searchResult = yrunner.runSearch(sentence + " "
+ verbAddition, MAX_SEARCH_RESULTS);
if (searchResult != null) {
// truncate the list of results for a shorter run
if (MAX_SEARCH_RESULTS < searchResult.size())
searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
for (HitBase item : searchResult) { // got some text from .html
if (item.getAbstractText() != null
&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
opinionSentencesToAdd
.add(buildParagraphOfGeneratedText(item, sentence, null));
}
}
}
stepCount++;
if (stepCount>MAX_STEPS)
break;
}
opinionSentencesToAdd = ContentGeneratorSupport.removeDuplicatesFromResultantHits(opinionSentencesToAdd);
return opinionSentencesToAdd;
}
/**
 * Takes a sentence and extracts noun phrases and entity names to form
 * search queries for finding relevant sentences on the web, which are then
 * subject to relevance assessment by Similarity. Search queries should be
 * neither too general (giving irrelevant search results) nor too specific
 * (giving too few search results).
 *
 * @param sentence
 *          the input sentence to form queries from
 * @return List<String> of search expressions
 */
public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
.getInstance();
List<ParseTreeChunk> nPhrases = pos
.formGroupedPhrasesFromChunksForSentence(sentence).get(0);
List<String> queryArrayStr = new ArrayList<String>();
for (ParseTreeChunk ch : nPhrases) {
String query = "";
int size = ch.getLemmas().size();
for (int i = 0; i < size; i++) {
if (ch.getPOSs().get(i).startsWith("N")
|| ch.getPOSs().get(i).startsWith("J")) {
query += ch.getLemmas().get(i) + " ";
}
}
query = query.trim();
int len = query.split(" ").length;
if (len < 2 || len > 5)
continue;
if (len < 4) { // every word should start with capital
String[] qs = query.split(" ");
boolean bAccept = true;
for (String w : qs) {
if (w.toLowerCase().equals(w)) // if only two words, the query
// has to be a person name,
// title or geo location
bAccept = false;
}
if (!bAccept)
continue;
}
query = query.trim().replace(" ", " +");
query = " +" + query;
queryArrayStr.add(query);
}
if (queryArrayStr.size() < 1) { // release constraints on NP down to 2
// keywords
for (ParseTreeChunk ch : nPhrases) {
String query = "";
int size = ch.getLemmas().size();
for (int i = 0; i < size; i++) {
if (ch.getPOSs().get(i).startsWith("N")
|| ch.getPOSs().get(i).startsWith("J")) {
query += ch.getLemmas().get(i) + " ";
}
}
query = query.trim();
int len = query.split(" ").length;
if (len < 2)
continue;
query = query.trim().replace(" ", " +");
query = " +" + query;
queryArrayStr.add(query);
}
}
queryArrayStr = ContentGeneratorSupport.removeDuplicatesFromQueries(queryArrayStr);
queryArrayStr.add(sentence);
return queryArrayStr;
}
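// A hypothetical input/output sketch for buildSearchEngineQueryFromSentence:
//   input:  "Albert Einstein received the Nobel Prize in Physics"
//   output: queries such as " +Albert +Einstein" and " +Nobel +Prize +Physics",
//           followed by the original sentence as a fallback query.
// Each kept noun/adjective lemma is prefixed with '+' so the search engine
// treats it as a required term.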
private Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll){
if (sentsAll == null)
sentsAll = new ArrayList<String>();
// put orig sentence in structure
List<String> origs = new ArrayList<String>();
origs.add(originalSentence);
item.setOriginalSentences(origs);
// form plain text from snippet
String snapshot = item.getAbstractText().replace("<b>", " ")
.replace("</b>", " ").replace("  ", " ").replace("  ", " ");
// fix a template expression which can be substituted by original if
// relevant
String snapshotMarked = snapshot.replace("...",
" _should_find_orig_ . _should_find_orig_");
String[] fragments = sm.splitSentences(snapshotMarked);
List<String> allFragms = new ArrayList<String>();
allFragms.addAll(Arrays.asList(fragments));
String[] sents = null;
String downloadedPage = null;
try {
if (snapshotMarked.length() != snapshot.length()) {
downloadedPage = pFetcher.fetchPage(item.getUrl());
if (downloadedPage != null && downloadedPage.length() > 100) {
item.setPageContent(downloadedPage);
String pageContent = Utils.fullStripHTML(item.getPageContent());
pageContent = GeneratedSentenceProcessor
.normalizeForSentenceSplitting(pageContent);
pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
sents = sm.splitSentences(pageContent);
sents = ContentGeneratorSupport.cleanListOfSents(sents);
}
}
} catch (Exception e) {
System.err
.println("Problem downloading the page and splitting it into sentences");
return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
}
return new Triple<List<String>, String, String[]>(allFragms, downloadedPage, sents);
}
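// Marker mechanics on a hypothetical snippet: search engines elide text with
// "...", so a snippet like
//   "Einstein developed the theory ... published in 1915"
// becomes
//   "Einstein developed the theory _should_find_orig_ . _should_find_orig_ published in 1915"
// and each marked fragment is later matched against the fetched page to recover
// the full original sentence.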
private String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults){
String[] mainAndFollowSent = null;
String downloadedPage = (String)fragmentExtractionResults.getSecond();
String[] sents = (String[])fragmentExtractionResults.getThird();
if (fragment.length() < 50)
return null;
String pageSentence = "";
// try to find original sentence from webpage
if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
&& sents.length > 0){
try {
// first try sorted sentences from page by length approach
String[] sentsSortedByLength = support.extractSentencesFromPage(downloadedPage);
try {
mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
} catch (Exception e) {
e.printStackTrace();
}
// if the above gives null, then try to match all sentences from the snippet fragment
if (mainAndFollowSent==null || mainAndFollowSent[0]==null){
mainAndFollowSent = ContentGeneratorSupport.getFullOriginalSentenceFromWebpageBySnippetFragment(
fragment.replace("_should_find_orig_", ""), sents);
}
} catch (Exception e) {
e.printStackTrace();
}
}
else
// or take the snippet fragment itself as the original
pageSentence = fragment;
if (pageSentence != null)
pageSentence = pageSentence.replace("_should_find_orig_", "");
return mainAndFollowSent;
}
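// Return contract (as consumed by verifyCandidateSentencesAndFormParagraph below):
// element 0 is the recovered page sentence and any further elements are follow-up
// sentences that may be appended to it; null means no candidate was recovered for
// this fragment.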
private Fragment verifyCandidateSentencesAndFormParagraph(
String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {
Fragment result = null;
String pageSentence = candidateSentences[0];
String followSent = "";
for(int i = 1; i< candidateSentences.length; i++)
followSent+= candidateSentences[i];
String title = item.getTitle();
// resultant sentence SHOULD NOT be longer than four times the size of the
// snippet fragment
if (!(pageSentence != null && pageSentence.length()>50
&& (float) pageSentence.length() / (float) fragment.length() < 4.0) )
return null;
try { // get score from syntactic match between sentence in
// original text and mined sentence
double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
+ " " + title, originalSentence);
List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
System.out
.println("Rejected sentence: no verb or imperative verb: "
+ pageSentence);
return null;
}
syntScore = parseTreeChunkListScorer
.getParseTreeChunkListScore(match);
System.out.println(parseTreeChunk.listToString(match) + " "
+ syntScore + "\n pre-processed sent = '" + pageSentence + "'");
if (syntScore < RELEVANCE_THRESHOLD) { // try matching against other seed sentences
for (String currSent : sentsAll) {
if (currSent.startsWith(originalSentence))
continue;
match = sm.assessRelevance(currSent, pageSentence)
.getMatchResult();
double syntScoreCurr = parseTreeChunkListScorer
.getParseTreeChunkListScore(match);
if (syntScoreCurr > syntScore) {
syntScore = syntScoreCurr;
}
}
if (syntScore > RELEVANCE_THRESHOLD) {
System.out.println("Got match with other sent: "
+ parseTreeChunk.listToString(match) + " " + syntScore);
}
}
measScore = stringDistanceMeasurer.measureStringDistance(
originalSentence, pageSentence);
if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
&& measScore < 0.8 && pageSentence.length() > 40) // >70
{
String pageSentenceProc = GeneratedSentenceProcessor
.acceptableMinedSentence(pageSentence);
if (pageSentenceProc != null) {
pageSentenceProc = GeneratedSentenceProcessor
.processSentence(pageSentenceProc);
followSent = GeneratedSentenceProcessor.processSentence(followSent);
if (followSent != null) {
pageSentenceProc += " "+ followSent;
}
pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
result = new Fragment(pageSentenceProc, syntScore + measScore
+ mentalScore + (double) pageSentenceProc.length()
/ (double) 50);
result.setSourceURL(item.getUrl());
result.fragment = fragment;
System.out.println("Accepted sentence: " + pageSentenceProc
+ "| with title= " + title);
System.out.println("For fragment = " + fragment);
} else
System.out
.println("Rejected sentence due to wrong area of the webpage: "
+ pageSentence);
} else
System.out.println("Rejected sentence due to low score: "
+ pageSentence);
} catch (Throwable t) {
t.printStackTrace();
}
return result;
}
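// Acceptance criteria, summarized from the checks above (a reading, not a spec):
// the candidate must contain a non-imperative verb, be at most four times the
// fragment length, score above RELEVANCE_THRESHOLD syntactically or above 0.5 on
// string distance to the seed, stay below 0.8 on string distance (rejecting
// near-copies), and exceed 40 characters.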
/**
 * Takes a single search result for an entity which is the subject of the
 * essay to be written and forms essay sentences from the title, abstract,
 * and possibly the original page.
 *
 * @param item
 *          the search result
 * @param originalSentence
 *          the seed for the essay to be written
 * @param sentsAll
 *          list of other sentences in the seed if it is multi-sentence
 * @return the search result with generated fragments attached
 */
public HitBase buildParagraphOfGeneratedText(HitBase item,
String originalSentence, List<String> sentsAll) {
List<Fragment> results = new ArrayList<Fragment>();
Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll);
List<String> allFragms = (List<String>)fragmentExtractionResults.getFirst();
for (String fragment : allFragms) {
String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults);
if (candidateSentences == null)
continue;
Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll);
if (res!=null)
results.add(res);
}
item.setFragments(results);
return item;
}
public static void main(String[] args) {
ContentGenerator f = new ContentGenerator();
List<HitBase> hits = null;
try {
// uncomment the sentence you would like to serve as a seed sentence for
// content generation for an event description
hits = f.generateContentAbout("Albert Einstein"
// "Britney Spears - The Femme Fatale Tour"
// "Rush Time Machine",
// "Blue Man Group" ,
// "Belly Dance With Zaharah",
// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
);
System.out.println(HitBase.toString(hits));
System.out.println(HitBase.toResultantString(hits));
// WordFileGenerator.createWordDoc("Essey about Albert Einstein",
// hits.get(0).getTitle(), hits);
} catch (Exception e) {
e.printStackTrace();
}
}
}