/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Logger;

import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.TextProcessor;

/**
 * Performs content generation for non-English languages (e.g. Spanish (ES), German (DE)).
 * Relevance of mined sentences is scored by keyword overlap (see
 * {@link #calculateKeywordScore(String, String)}), which keeps the processing language-independent.
 */
public class RelatedSentenceFinderML extends RelatedSentenceFinder {
private static final Logger LOG = Logger
.getLogger("opennlp.tools.similarity.apps.RelatedSentenceFinderML");
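/**
 * @param ms     maximum number of steps (MAX_STEPS)
 * @param msr    maximum number of search results to process (MAX_SEARCH_RESULTS)
 * @param thresh relevance threshold for accepting mined sentences (RELEVANCE_THRESHOLD)
 * @param key    API key for the underlying search runner
 */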
public RelatedSentenceFinderML(int ms, int msr, float thresh, String key) {
this.MAX_STEPS = ms;
this.MAX_SEARCH_RESULTS = msr;
this.RELEVANCE_THRESHOLD = thresh;
yrunner.setKey(key);
}
public RelatedSentenceFinderML() {
// no-arg constructor: the inherited default settings are kept
}
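/**
 * Runs a web search for the seed entity/sentence and mines the hits for relevant sentences.
 * Typical usage (see {@link #main(String[])} below):
 * <pre>
 * RelatedSentenceFinderML finder = new RelatedSentenceFinderML();
 * List&lt;HitBase&gt; hits = finder.generateContentAbout("Albert Einstein");
 * </pre>
 *
 * @param sentence the entity or seed sentence to write about
 * @return search hits augmented with mined, relevance-verified fragments
 * @throws Exception if searching or page processing fails
 */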
public List<HitBase> generateContentAbout(String sentence) throws Exception {
List<HitBase> opinionSentencesToAdd = new ArrayList<HitBase>();
System.out.println(" \n=== Entity to write about = " + sentence);
List<String> nounPhraseQueries = new ArrayList<String>();
List<HitBase> searchResult = yrunner.runSearch(sentence, 100);
if (searchResult != null) {
// truncate to MAX_SEARCH_RESULTS for a shorter run
if (MAX_SEARCH_RESULTS < searchResult.size())
searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
for (HitBase item : searchResult) { // got some text from .html
if (item.getAbstractText() != null
&& !(item.getUrl().indexOf(".pdf") > 0)) { // exclude pdf
opinionSentencesToAdd
.add(augmentWithMinedSentencesAndVerifyRelevance(item,
sentence, null));
}
}
}
opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd);
return opinionSentencesToAdd;
}
/**
 * Takes a single search result for an entity which is the subject of the essay
 * to be written, and forms essay sentences from the title, abstract, and
 * possibly the original page.
 *
 * @param item
 *          search result
 * @param originalSentence
 *          seed for the essay to be written
 * @param sentsAll
 *          other sentences in the seed if it is multi-sentence
 * @return the search result augmented with mined fragments
 */
public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
String originalSentence, List<String> sentsAll) {
if (sentsAll == null)
sentsAll = new ArrayList<String>();
// put orig sentence in structure
List<String> origs = new ArrayList<String>();
origs.add(originalSentence);
item.setOriginalSentences(origs);
String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
.replace("  ", " ").replace("  ", " ");
// generation results for this sentence
List<Fragment> result = new ArrayList<Fragment>();
// form plain text from snippet
String snapshot = item.getAbstractText().replace("<b>", " ")
.replace("</b>", " ").replace("  ", " ").replace("  ", " ");
// fix a template expression which can be substituted by original if
// relevant
String snapshotMarked = snapshot.replace("...",
" _should_find_orig_ . _should_find_orig_");
String[] fragments = sm.splitSentences(snapshotMarked);
List<String> allFragms = new ArrayList<String>();
allFragms.addAll(Arrays.asList(fragments));
String[] sents = null;
String downloadedPage = null;
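// If the snippet was truncated (an ellipsis was replaced with the marker above), fetch the
// full page so that the original sentences can be recovered from its text.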
try {
if (snapshotMarked.length() != snapshot.length()) {
downloadedPage = pFetcher.fetchPage(item.getUrl());
if (downloadedPage != null && downloadedPage.length() > 100) {
item.setPageContent(downloadedPage);
String pageContent = Utils.fullStripHTML(item.getPageContent());
pageContent = GeneratedSentenceProcessor
.normalizeForSentenceSplitting(pageContent);
pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")
.replace("..", ".").replace(". . .", " ").trim();
// sometimes html breaks are converted into '  ' (two spaces), so we need to
// put '.' before the following capitalized word
sents = sm.splitSentences(pageContent);
sents = ContentGeneratorSupport.cleanListOfSents(sents);
}
}
} catch (Exception e) {
System.err
.println("Problem downloading the page and splitting into sentences");
return item;
}
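// For each snippet fragment, either recover the full original sentence (and the one
// following it) from the downloaded page, or fall back to the snippet fragment itself.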
for (String fragment : allFragms) {
String followSent = null;
if (fragment.length() < 50)
continue;
String pageSentence = "";
// try to find original sentence from webpage
if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
&& sents.length > 0)
try {
// first try the sentences from the page, sorted by length
String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
String[] mainAndFollowSent = null;
try {
mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
} catch (Exception e) {
e.printStackTrace();
}
// if the above gives null, then try to match against all sentences from the snippet fragment
if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
fragment.replace("_should_find_orig_", ""), sents);
}
if (mainAndFollowSent != null && mainAndFollowSent[0] != null) {
pageSentence = mainAndFollowSent[0];
followSent = mainAndFollowSent[1];
}
} catch (Exception e) {
e.printStackTrace();
}
else
// or get original snippet
pageSentence = fragment;
if (pageSentence != null)
pageSentence = pageSentence.replace("_should_find_orig_", "");
// resultant sentence SHOULD NOT be longer than four times the size of
// the snippet fragment
if (pageSentence != null
&& (float) pageSentence.length() / (float) fragment.length() < 4.0) { // was 2.0
try { // get score from keyword overlap between the mined sentence (plus
// its title) and the seed sentence
double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;
syntScore = calculateKeywordScore(pageSentence + " " + title, originalSentence);
if (syntScore < RELEVANCE_THRESHOLD) { // trying other sentences of the seed
for (String currSent : sentsAll) {
if (currSent.startsWith(originalSentence))
continue;
double syntScoreCurr = calculateKeywordScore(currSent, pageSentence);
if (syntScoreCurr > syntScore) {
syntScore = syntScoreCurr;
}
}
if (syntScore > RELEVANCE_THRESHOLD) {
System.out.println("Got match with other sent: " + syntScore);
}
}
measScore = stringDistanceMeasurer.measureStringDistance(
originalSentence, pageSentence);
// now possibly increase score by finding mental verbs
// indicating opinions
for (String s : MENTAL_VERBS) {
if (pageSentence.indexOf(s) > -1) {
mentalScore += 0.3;
break;
}
}
if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5 || mentalScore > 0.5)
&& measScore < 0.8 && pageSentence.length() > 40) // >70
{
String pageSentenceProc = GeneratedSentenceProcessor
.acceptableMinedSentence(pageSentence);
if (pageSentenceProc != null) {
pageSentenceProc = GeneratedSentenceProcessor
.processSentence(pageSentenceProc);
if (followSent != null) {
pageSentenceProc += " "
+ GeneratedSentenceProcessor.processSentence(followSent);
}
pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
+ mentalScore + (double) pageSentenceProc.length()
/ (double) 50);
f.setSourceURL(item.getUrl());
f.fragment = fragment;
result.add(f);
System.out.println("Accepted sentence: " + pageSentenceProc
+ "| with title= " + title);
System.out.println("For fragment = " + fragment);
} else
System.out
.println("Rejected sentence due to wrong area at webpage: "
+ pageSentence);
} else
System.out.println("Rejected sentence due to low score: "
+ pageSentence);
} catch (Throwable t) {
t.printStackTrace();
}
}
}
item.setFragments(result);
return item;
}
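/**
 * Keyword-overlap relevance score: tokenizes both sentences and returns the number of
 * tokens they have in common (no parsing required, which keeps it language-independent).
 *
 * @param currSent     first sentence (e.g. the seed sentence, or mined sentence plus title)
 * @param pageSentence second sentence to compare against
 * @return the count of overlapping tokens
 */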
private double calculateKeywordScore(String currSent, String pageSentence) {
List<String> list1 = TextProcessor.fastTokenize(currSent, false);
List<String> list2 = TextProcessor.fastTokenize(pageSentence, false);
List<String> overlap1 = new ArrayList<String>(list1);
overlap1.retainAll(list2);
return overlap1.size();
}
public static void main(String[] args) {
RelatedSentenceFinderML f = new RelatedSentenceFinderML();
List<HitBase> hits = null;
try {
// uncomment the sentence you would like to serve as a seed sentence for
// content generation for an event description
hits = f.generateContentAbout("Albert Einstein"
// "Britney Spears - The Femme Fatale Tour"
// "Rush Time Machine",
// "Blue Man Group" ,
// "Belly Dance With Zaharah",
// "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer",
// "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis",
);
System.out.println(HitBase.toString(hits));
System.out.println(HitBase.toResultantString(hits));
// WordFileGenerator.createWordDoc("Essey about Albert Einstein",
// hits.get(0).getTitle(), hits);
} catch (Exception e) {
e.printStackTrace();
}
}
}