| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.similarity.apps; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Comparator; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.apache.commons.lang.StringUtils; |
| |
| import opennlp.tools.parse_thicket.Triple; |
| import opennlp.tools.similarity.apps.utils.PageFetcher; |
| import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; |
| import opennlp.tools.similarity.apps.utils.Utils; |
| import opennlp.tools.textsimilarity.ParseTreeChunk; |
| import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; |
| import opennlp.tools.textsimilarity.SentencePairMatchResult; |
| import opennlp.tools.textsimilarity.TextProcessor; |
| import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
 * Provides content generation via web mining and syntactic generalization: sentences
 * related to a seed topic are mined from the web, then converted and combined into a
 * form intended to be readable by humans.
 * <p>
 * Examples of generated articles, given the article title:
 * http://www.allvoices.com/contributed-news/9423860/content/81937916-ichie-sings-jazz-blues-contemporary-tunes
 * http://www.allvoices.com/contributed-news/9415063-britney-spears-femme-fatale-in-north-sf-bay-area
| * |
| */ |
| public class RelatedSentenceFinder { |
| |
| private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| |
| final PageFetcher pFetcher = new PageFetcher(); |
| final ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor.getInstance(); |
| protected final ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer(); |
| protected final ParseTreeChunk parseTreeChunk = new ParseTreeChunk(); |
| protected static final StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer(); |
| protected final BingQueryRunner yrunner = new BingQueryRunner(); |
| protected int MAX_STEPS = 1; |
| protected int MAX_SEARCH_RESULTS = 1; |
| protected float RELEVANCE_THRESHOLD = 1.1f; |
| protected final Set<String> visitedURLs = new HashSet<>(); |
| |
  // verbs used to indicate that a sentence expresses an opinion, and is thus more
  // appropriate for content generation
  static final List<String> MENTAL_VERBS = Arrays.asList(
      "want", "know", "believe", "appeal", "ask",
      "accept", "agree", "allow", "assume",
      "check", "confirm", "convince", "deny", "disagree", "explain",
      "ignore", "inform", "remind", "request", "suggest", "suppose",
      "think", "threaten", "try", "understand");
| |
| public RelatedSentenceFinder(int ms, int msr, float thresh, String key) { |
| this.MAX_STEPS = ms; |
| this.MAX_SEARCH_RESULTS = msr; |
    this.RELEVANCE_THRESHOLD = thresh;
| yrunner.setKey(key); |
| } |
| |
| int generateContentAboutIter = 0; |
| |
| public RelatedSentenceFinder() { |
| |
| } |
| |
| public void setLang(String lang) { |
| yrunner.setLang(lang); |
| |
| } |
| |
| public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word, List<String> sents) { |
| |
| return yrunner.runSearch(word, 100); |
| } |
| |
| public List<HitBase> findRelatedOpinionsForSentence(String sentence, List<String> sents) { |
| List<HitBase> opinionSentencesToAdd = new ArrayList<>(); |
    LOG.info("=== Sentence = {}", sentence);
| List<String> nounPhraseQueries = buildSearchEngineQueryFromSentence(sentence); |
| |
    // use the shared query runner configured with this instance's key and language
    // (a fresh local instance would lack those settings)
| for (String query : nounPhraseQueries) { |
      LOG.info("query = {}", query);
| // query += " "+join(MENTAL_VERBS, " OR ") ; |
| List<HitBase> searchResult = yrunner.runSearch(query, 100); |
| if (searchResult != null) { |
| for (HitBase item : searchResult) { // got some text from .html |
          if (item.getAbstractText() != null
              && !item.getUrl().contains(".pdf")) { // exclude pdf documents
            opinionSentencesToAdd
                .add(augmentWithMinedSentencesAndVerifyRelevance(item,
                    sentence, sents));
          }
| } |
| } |
| } |
| |
| return removeDuplicatesFromResultantHits(opinionSentencesToAdd); |
| } |
| |
| /** |
| * Main content generation function which takes a seed as a person, rock |
| * group, or other entity name and produce a list of text fragments by web |
| * mining for <br> |
| * |
| * @param sentence |
| * entity name |
| * @return List<HitBase> of text fragment structures which contain approved |
| * (in terms of relevance) mined sentences, as well as original search |
| * results objects such as doc titles, abstracts, and urls. |
| */ |
| |
| public List<HitBase> generateContentAbout(String sentence) throws Exception { |
| List<HitBase> opinionSentencesToAdd = new ArrayList<>(); |
    LOG.info("=== Entity to write about = {}", sentence);
| |
    String[] extraKeywords = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity(sentence);
    // guard against null BEFORE logging/iterating, to avoid an NPE
    if (extraKeywords == null || extraKeywords.length < 1)
      extraKeywords = StoryDiscourseNavigator.FREQUENT_PERFORMING_VERBS;
    LOG.info("Found extraKeywords {}", Arrays.asList(extraKeywords));
| |
| int stepCount=0; |
| for (String verbAddition : extraKeywords) { |
      List<HitBase> searchResult = yrunner.runSearch(sentence + " "
          + verbAddition, MAX_SEARCH_RESULTS); //100);
      if (searchResult != null) {
        //TODO for shorter run
        if (MAX_SEARCH_RESULTS < searchResult.size())
          searchResult = searchResult.subList(0, MAX_SEARCH_RESULTS);
        for (HitBase item : searchResult) { // got some text from .html
          if (item.getAbstractText() != null
              && !item.getUrl().contains(".pdf") && !visitedURLs.contains(item.getUrl())) { // exclude pdf documents
| opinionSentencesToAdd |
| .add(//augmentWithMinedSentencesAndVerifyRelevance(item, |
| // sentence, null)); |
| buildParagraphOfGeneratedText(item, sentence, null)); |
| visitedURLs.add(item.getUrl()); |
| } |
| } |
| } |
| stepCount++; |
| if (stepCount>MAX_STEPS) |
| break; |
| } |
| |
| // if nothing is written, then get first search result and try again |
| try { |
| if (generateContentAboutIter<4 && ContentGeneratorSupport.problematicHitList(opinionSentencesToAdd)){ |
| List<HitBase> resultList = yrunner.runSearch(sentence, 10); |
| String discoveredSimilarTopic = resultList.get(generateContentAboutIter).getTitle(); |
| discoveredSimilarTopic = ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(discoveredSimilarTopic); |
| generateContentAboutIter++; |
| opinionSentencesToAdd = generateContentAbout(discoveredSimilarTopic); |
| } |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| |
| return removeDuplicatesFromResultantHits(opinionSentencesToAdd); |
| } |
| |
| /** |
   * Takes a sentence and extracts noun phrases and entity names to form search
   * queries for finding relevant sentences on the web, which are then subject
   * to relevance assessment by similarity. Search queries should be neither too
   * general (yielding irrelevant search results) nor too specific (yielding too
   * few search results).
| * |
| * @param sentence |
| * input sentence to form queries |
| * @return List<String> of search expressions |
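   * <p>
   * Illustrative sketch (hypothetical output; the exact phrases depend on the parser
   * model): nouns and adjectives of each noun phrase are kept, every keyword is
   * marked as mandatory with a {@code +} prefix, and the original sentence is
   * appended as a fallback query:
   * <pre>{@code
   * buildSearchEngineQueryFromSentence("Albert Einstein developed the theory of relativity");
   * // e.g. [" +Albert +Einstein", " +theory +relativity",
   * //       "Albert Einstein developed the theory of relativity"]
   * }</pre>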
| */ |
| public static List<String> buildSearchEngineQueryFromSentence(String sentence) { |
| ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance(); |
| |
| List<ParseTreeChunk> nPhrases = pos.formGroupedPhrasesFromChunksForSentence(sentence).get(0); |
| List<String> queryArrayStr = new ArrayList<>(); |
| for (ParseTreeChunk ch : nPhrases) { |
| StringBuilder query = new StringBuilder(); |
| int size = ch.getLemmas().size(); |
| |
| for (int i = 0; i < size; i++) { |
| if (ch.getPOSs().get(i).startsWith("N") |
| || ch.getPOSs().get(i).startsWith("J")) { |
| query.append(ch.getLemmas().get(i)).append(" "); |
| } |
| } |
| query = new StringBuilder(query.toString().trim()); |
| int len = query.toString().split("\\s+").length; |
| if (len < 2 || len > 5) |
| continue; |
      if (len < 4) { // every word should start with a capital: if the phrase is only
        // 2-3 words, it has to be a person name, title or geo location
        String[] qs = query.toString().split("\\s+");
        boolean bAccept = true;
        for (String w : qs) {
          if (w.toLowerCase().equals(w))
            bAccept = false;
        }
| if (!bAccept) |
| continue; |
| } |
| |
| query = new StringBuilder(query.toString().trim().replace(" ", " +")); |
| query.insert(0, " +"); |
| |
| queryArrayStr.add(query.toString()); |
| |
| } |
| if (queryArrayStr.size() < 1) { // release constraints on NP down to 2 |
| // keywords |
| for (ParseTreeChunk ch : nPhrases) { |
| StringBuilder query = new StringBuilder(); |
| int size = ch.getLemmas().size(); |
| |
| for (int i = 0; i < size; i++) { |
| if (ch.getPOSs().get(i).startsWith("N") |
| || ch.getPOSs().get(i).startsWith("J")) { |
| query.append(ch.getLemmas().get(i)).append(" "); |
| } |
| } |
| query = new StringBuilder(query.toString().trim()); |
| int len = query.toString().split("\\s+").length; |
| if (len < 2) |
| continue; |
| |
| query = new StringBuilder(query.toString().trim().replace(" ", " +")); |
| query.insert(0, " +"); |
| |
| queryArrayStr.add(query.toString()); |
| |
| } |
| } |
| |
| queryArrayStr = removeDuplicatesFromQueries(queryArrayStr); |
| queryArrayStr.add(sentence); |
| |
| return queryArrayStr; |
| |
| } |
| |
| /** |
   * Removes duplicates from formed queries, to avoid duplicate search results and
   * repetitive searches afterwards.
   *
   * @param hits List<String> of sentences (search queries, or search result
   *             abstracts, or titles)
| * @return List<String> of sentences where dupes are removed |
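   * <p>
   * Hypothetical sketch (two entries count as duplicates when their
   * {@code StringDistanceMeasurer} score exceeds the 0.8 threshold):
   * <pre>{@code
   * removeDuplicatesFromQueries(Arrays.asList(
   *     " +Albert +Einstein", " +Albert  +Einstein", " +theory +relativity"));
   * // e.g. [" +Albert +Einstein", " +theory +relativity"]
   * }</pre>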
| */ |
| public static List<String> removeDuplicatesFromQueries(List<String> hits) { |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
    double dupeThresh = 0.8; // if more similar than this, considered dupes (was 0.7)
| List<Integer> idsToRemove = new ArrayList<>(); |
| List<String> hitsDedup = new ArrayList<>(); |
| try { |
| for (int i = 0; i < hits.size(); i++) |
| for (int j = i + 1; j < hits.size(); j++) { |
| String title1 = hits.get(i); |
| String title2 = hits.get(j); |
| if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2)) |
| continue; |
| if (meas.measureStringDistance(title1, title2) > dupeThresh) { |
| idsToRemove.add(j); // dupes found, later list member to |
| // be deleted |
| |
| } |
| } |
| |
| for (int i = 0; i < hits.size(); i++) |
| if (!idsToRemove.contains(i)) |
| hitsDedup.add(hits.get(i)); |
| |
| if (hitsDedup.size() < hits.size()) { |
| LOG.info("Removed duplicates from formed query, including {}", hits.get(idsToRemove.get(0))); |
| } |
| |
| } catch (Exception e) { |
| LOG.error("Problem removing duplicates from query list", e); |
| } |
| |
| return hitsDedup; |
| |
| } |
| |
| /** |
   * Removes duplicate fragments from search results.
   *
   * @param hits List<HitBase> of search results objects
   * @return List<HitBase> of search results objects where duplicate fragments are removed
| */ |
| public static List<HitBase> removeDuplicatesFromResultantHits(List<HitBase> hits) { |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
    double dupeThresh = 0.7; // if more similar than this, considered dupes (was 0.8)
| try { |
| for (int i = 0; i < hits.size(); i++) |
| for (int j = i + 1; j < hits.size(); j++) { |
| HitBase hit2 = hits.get(j); |
| List<Fragment> fragmList1 = hits.get(i).getFragments(); |
| List<Fragment> fragmList2 = hits.get(j).getFragments(); |
| List<Fragment> fragmList2Results = new ArrayList<>(fragmList2); |
| for (Fragment f1 : fragmList1) |
| for (Fragment f2 : fragmList2) { |
| String sf1 = f1.getResultText(); |
| String sf2 = f2.getResultText(); |
            if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf2))
              continue;
| if (meas.measureStringDistance(sf1, sf2) > dupeThresh) { |
| fragmList2Results.remove(f2); |
| LOG.debug("Removed duplicates from formed fragments list: {}", sf2); |
| } |
| } |
| |
| hit2.setFragments(fragmList2Results); |
| hits.set(j, hit2); |
| } |
| } catch (Exception e) { |
| LOG.error("Problem removing duplicates from list of fragment", e); |
| } |
| return hits; |
| } |
| |
| /** |
   * Takes a single search result for an entity which is the subject of the essay
   * to be written and forms essay sentences from the title, abstract, and
   * possibly the original page.
   *
   * @param item The HitBase search result
   * @param originalSentence The seed for the essay to be written
   * @param sentsAll List<String> of other sentences in the seed if it is
   *          multi-sentence
| * @return search result |
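   * <p>
   * Processing sketch: ellipses in the snippet are replaced by the marker
   * {@code _should_find_orig_}, and each marked fragment is then aligned against
   * sentences extracted from the downloaded page to recover the full original
   * sentence and its follow-up sentence:
   * <pre>{@code
   * // snippet: "Einstein developed ... general relativity"
   * // marked:  "Einstein developed  _should_find_orig_ . _should_find_orig_ general relativity"
   * }</pre>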
| */ |
| |
| public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item, |
| String originalSentence, List<String> sentsAll) { |
| if (sentsAll == null) |
| sentsAll = new ArrayList<>(); |
| // put orig sentence in structure |
| List<String> origs = new ArrayList<>(); |
| origs.add(originalSentence); |
| item.setOriginalSentences(origs); |
| String title = item.getTitle().replace("<b>", " ").replace("</b>", " ") |
        .replace("  ", " ").replace("  ", " ");
| // generation results for this sentence |
| List<Fragment> result = new ArrayList<>(); |
| // form plain text from snippet |
| String snapshot = item.getAbstractText().replace("<b>", " ") |
        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");
| |
| |
    // insert a marker expression in place of "..." which can later be substituted
    // by the original page sentence if a relevant one is found
| String snapshotMarked = snapshot.replace("...", |
| " _should_find_orig_ . _should_find_orig_"); |
| String[] fragments = sm.splitSentences(snapshotMarked); |
| List<String> allFragms = new ArrayList<>(Arrays.asList(fragments)); |
| |
| String[] sents = null; |
| String downloadedPage = null; |
| try { |
| if (snapshotMarked.length() != snapshot.length()) { |
| downloadedPage = pFetcher.fetchPage(item.getUrl()); |
| if (downloadedPage != null && downloadedPage.length() > 100) { |
| item.setPageContent(downloadedPage); |
| String pageContent = Utils.fullStripHTML(item.getPageContent()); |
| pageContent = GeneratedSentenceProcessor |
| .normalizeForSentenceSplitting(pageContent); |
| pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent); |
| //pageContent = pageContent.trim().replaceAll(" [A-Z]", ". $0")// .replace(" ", |
| // // ". ") |
| // .replace("..", ".").replace(". . .", " ").trim(); // sometimes html breaks are converted into ' ' (two spaces), so |
| // we need to put '.' |
| sents = sm.splitSentences(pageContent); |
| |
| sents = ContentGeneratorSupport.cleanListOfSents(sents); |
| } |
| } |
| } catch (Exception e) { |
| LOG.error("Problem downloading the page and splitting into sentences", e); |
| return item; |
| } |
| |
| for (String fragment : allFragms) { |
| StringBuilder followSent = new StringBuilder(); |
| if (fragment.length() < 50) |
| continue; |
| String pageSentence = ""; |
| // try to find original sentence from webpage |
| if (fragment.contains("_should_find_orig_") && sents != null && sents.length > 0){ |
| try { |
| // first try sorted sentences from page by length approach |
| String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage); |
| String[] mainAndFollowSent = null; |
| |
| try { |
| mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment( |
| fragment.replace("_should_find_orig_", ""), sentsSortedByLength); |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
          // if the above gives null, then try to match all sentences from the snippet fragment
          if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
            mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
                fragment.replace("_should_find_orig_", ""), sents);
          }
| |
          if (mainAndFollowSent != null && mainAndFollowSent[0] != null) {
| pageSentence = mainAndFollowSent[0]; |
| for(int i = 1; i< mainAndFollowSent.length; i++) |
| if (mainAndFollowSent[i]!=null) |
| followSent.append(mainAndFollowSent[i]); |
| } |
| |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| } else |
| // or get original snippet |
| pageSentence = fragment; |
| if (pageSentence != null) |
| pageSentence = pageSentence.replace("_should_find_orig_", ""); |
| |
      // resultant sentence SHOULD NOT be longer than four times the size of
      // the snippet fragment
      if (pageSentence != null && pageSentence.length() > 50)
      // && (float) pageSentence.length() / (float) fragment.length() < 4.0) // was 2.0
      {
| |
| try { // get score from syntactic match between sentence in |
| // original text and mined sentence |
| double measScore, syntScore, mentalScore = 0.0; |
| |
| SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence |
| + " " + title, originalSentence); |
| List<List<ParseTreeChunk>> match = matchRes.getMatchResult(); |
| if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) { |
| LOG.debug("Rejected Sentence : No verb OR Yes imperative verb: {}", pageSentence); |
| continue; |
| } |
| |
| syntScore = parseTreeChunkListScorer.getParseTreeChunkListScore(match); |
| LOG.debug("{} {}\n pre-processed sent = '{}'", parseTreeChunk.listToString(match), syntScore, pageSentence); |
| |
| if (syntScore < RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents |
| for (String currSent : sentsAll) { |
| if (currSent.startsWith(originalSentence)) |
| continue; |
| match = sm.assessRelevance(currSent, pageSentence).getMatchResult(); |
| double syntScoreCurr = parseTreeChunkListScorer.getParseTreeChunkListScore(match); |
| if (syntScoreCurr > syntScore) { |
| syntScore = syntScoreCurr; |
| } |
| } |
| if (syntScore > RELEVANCE_THRESHOLD) { |
| LOG.debug("Got match with other sent: {} {}", parseTreeChunk.listToString(match), syntScore); |
| } |
| } |
| |
| measScore = STRING_DISTANCE_MEASURER.measureStringDistance( |
| originalSentence, pageSentence); |
| |
| |
| if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5) |
| && measScore < 0.8 && pageSentence.length() > 40) // >70 |
| { |
| String pageSentenceProc = GeneratedSentenceProcessor |
| .acceptableMinedSentence(pageSentence); |
| if (pageSentenceProc != null) { |
| pageSentenceProc = GeneratedSentenceProcessor |
| .processSentence(pageSentenceProc); |
            followSent = new StringBuilder(StringUtils.defaultString(
                GeneratedSentenceProcessor.processSentence(followSent.toString())));
            if (followSent.length() > 0) {
              pageSentenceProc += " " + followSent;
            }
| |
| pageSentenceProc = Utils.convertToASCII(pageSentenceProc); |
| Fragment f = new Fragment(pageSentenceProc, syntScore + measScore |
| + mentalScore + (double) pageSentenceProc.length() |
| / (double) 50); |
| f.setSourceURL(item.getUrl()); |
| f.fragment = fragment; |
| result.add(f); |
| LOG.debug("Accepted sentence: {} | {} | with title = {}", pageSentenceProc, followSent, title); |
| LOG.debug("For fragment = {}", fragment); |
| } else |
| LOG.debug("Rejected sentence due to wrong area at webpage: {}", pageSentence); |
| } else |
| LOG.debug("Rejected sentence due to low score: {}", pageSentence); |
| // } |
| } catch (Throwable t) { |
| LOG.error(t.getLocalizedMessage(), t); |
| } |
| } |
| } |
| item.setFragments(result); |
| return item; |
| } |
| |
| |
| // given a fragment from snippet, finds an original sentence at a webpage by |
| // optimizing alignment score |
| public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment( |
| String fragment, String[] sents) { |
| if (fragment.trim().length() < 15) |
| return null; |
| |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
| double dist = 0.0; |
| String result = null; |
| StringBuilder followSent = new StringBuilder(); |
| for (int i = 0; i < sents.length; i++) { |
| String s = sents[i]; |
| if (s == null || s.length() < 30) |
| continue; |
| double distCurr = meas.measureStringDistance(s, fragment); |
| if (distCurr > dist && distCurr > 0.4) { |
| result = s; |
| dist = distCurr; |
| try { |
| if (i < sents.length - 1 && sents[i + 1].length() > 60) { |
| String f1 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+1]); |
| if (f1!=null){ |
| followSent = new StringBuilder(f1); |
| } |
| } |
| |
| if (i < sents.length - 2 && sents[i + 2].length() > 60) { |
| String f2 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+2]); |
| if (f2!=null){ |
| followSent.append(" ").append(f2); |
| } |
| } |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| } |
| } |
| return new String[] { result, followSent.toString()}; |
| } |
| |
  // given a fragment from a snippet, finds the best-matching original sentence at a
  // webpage by optimizing alignment score
| public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment( |
| String fragment, String[] sents) { |
| if (fragment.trim().length() < 15) |
| return null; |
| int bestSentIndex = -1; |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
    double distBest = 0.0;
| String result = null, followSent = null; |
| for (int i = 0; i < sents.length; i++) { |
| String s = sents[i]; |
| if (s == null || s.length() < 30) |
| continue; |
| double distCurr = meas.measureStringDistance(s, fragment); |
| if (distCurr > distBest) { |
| distBest = distCurr; |
| bestSentIndex = i; |
| } |
| |
| } |
    if (bestSentIndex > -1 && distBest > 0.4) {
      result = sents[bestSentIndex];
| |
| if (bestSentIndex < sents.length - 1 |
| && sents[bestSentIndex + 1].length() > 60) { |
| followSent = sents[bestSentIndex + 1]; |
| } |
| |
| } |
| |
| return new String[] { result, followSent }; |
| } |
| |
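  /**
   * Extracts candidate sentences from a downloaded page without parsing its HTML
   * structure: whitespace runs left over from stripped markup act as chunk
   * boundaries, the chunks are sorted by length, and the longest ones are kept,
   * on the assumption that longer chunks are more likely to be content sentences
   * than navigation or boilerplate.
   */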
  public String[] extractSentencesFromPage(String downloadedPage) {

    int maxSentsFromPage = 100;

    // runs of multiple spaces (typically artifacts of stripped HTML breaks) become
    // chunk boundaries: collapse them to a single '#' delimiter and split on it
    downloadedPage = downloadedPage.replace("     ", "&");
    downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
    String[] sents = downloadedPage.split("#");
| List<TextChunk> sentsList = new ArrayList<>(); |
| for(String s: sents){ |
| s = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s); |
| /* s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ") |
| .replace(": ", ". ").replace("- ", ". "). |
| replace (". .",".").trim(); */ |
| sentsList.add(new TextChunk(s, s.length())); |
| } |
| sentsList.sort(new TextChunkComparable()); |
| |
| String[] longestSents = new String[maxSentsFromPage]; |
| int j=0; |
    // take the last (longest) maxSentsFromPage chunks after the ascending sort
    int initIndex = sentsList.size() - maxSentsFromPage;
    if (initIndex < 0)
      initIndex = 0;
| for(int i=initIndex; i< sentsList.size() && j<maxSentsFromPage ; i++){ |
| longestSents[j] = sentsList.get(i).text; |
| j++; |
| } |
| |
| sents = cleanSplitListOfSents(longestSents); |
| |
| //sents = removeDuplicates(sents); |
| //sents = verifyEnforceStartsUpperCase(sents); |
| |
| return sents; |
| } |
| |
| public static class TextChunk { |
| public TextChunk(String s, int length) { |
| this.text = s; |
| this.len = length; |
| } |
| |
| public final String text; |
| public final int len; |
| } |
| |
| public static class TextChunkComparable implements Comparator<TextChunk> { |
| @Override |
| public int compare(TextChunk ch1, TextChunk ch2) { |
| return Integer.compare(ch1.len, ch2.len); |
| } |
| } |
| |
| protected String[] cleanSplitListOfSents(String[] longestSents){ |
| float minFragmentLength = 40, minFragmentLengthSpace=4; |
| |
| List<String> sentsClean = new ArrayList<>(); |
| for (String sentenceOrMultSent : longestSents) { |
| if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20) |
| continue; |
| if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){ |
| //System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent); |
| continue; |
| } |
      // reject portions whose average sentence length is too short,
      // e.g. "aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n."
| int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length; |
| float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots; |
| if ( avgSentenceLengthInTextPortion<minFragmentLength) |
| continue; |
      // reject portions whose average token length is too short,
      // e.g. "o oo o ooo o o o ooo oo ooo o o oo"
| numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length; |
| avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots; |
| if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace) |
| continue; |
| |
| List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent); |
| |
| // forced split by ',' somewhere in the middle of sentence |
| // disused - Feb 26 13 |
| //furtherSplit = furtherMakeSentencesShorter(furtherSplit); |
      // drop the last (possibly truncated) sentence of the portion
      if (!furtherSplit.isEmpty())
        furtherSplit.remove(furtherSplit.size() - 1);
| for(String s : furtherSplit){ |
| if (s.indexOf('|')>-1) |
| continue; |
| s = s.replace("<em>"," ").replace("</em>"," "); |
| s = Utils.convertToASCII(s); |
| sentsClean.add(s); |
| } |
| } |
| return sentsClean.toArray(new String[0]); |
| } |
| |
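  /**
   * Collects the raw material for paragraph generation from a single search result:
   * snippet fragments (with ellipses replaced by the {@code _should_find_orig_}
   * marker), the downloaded page, and the sentences split from that page.
   */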
| public Triple<List<String>, String, String[]> formCandidateFragmentsForPage(HitBase item, String originalSentence, List<String> sentsAll){ |
| // put orig sentence in structure |
| List<String> origs = new ArrayList<>(); |
| origs.add(originalSentence); |
| item.setOriginalSentences(origs); |
| String title = item.getTitle().replace("<b>", " ").replace("</b>", " ") |
        .replace("  ", " ").replace("  ", " ");
| // generation results for this sentence |
| // form plain text from snippet |
| String snapshot = item.getAbstractText().replace("<b>", " ") |
        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");
| |
    // insert a marker expression in place of "..." which can later be substituted
    // by the original page sentence if a relevant one is found
| String snapshotMarked = snapshot.replace("...", |
| " _should_find_orig_ . _should_find_orig_"); |
| String[] fragments = sm.splitSentences(snapshotMarked); |
| List<String> allFragms = new ArrayList<>(Arrays.asList(fragments)); |
| |
| String[] sents = null; |
| String downloadedPage = null; |
| try { |
| if (snapshotMarked.length() != snapshot.length()) { |
| downloadedPage = pFetcher.fetchPage(item.getUrl()); |
| if (downloadedPage != null && downloadedPage.length() > 100) { |
| item.setPageContent(downloadedPage); |
| String pageContent = Utils.fullStripHTML(item.getPageContent()); |
| pageContent = GeneratedSentenceProcessor |
| .normalizeForSentenceSplitting(pageContent); |
| pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent); |
| //pageContent = pageContent.trim().replaceAll(" [A-Z]", ". $0")// .replace(" ", |
| // // ". ") |
| // .replace("..", ".").replace(". . .", " "). |
| // replace(". .",". ").trim(); // sometimes html breaks are converted into ' ' (two spaces), so |
| // we need to put '.' |
| sents = sm.splitSentences(pageContent); |
| |
| sents = ContentGeneratorSupport.cleanListOfSents(sents); |
| } |
| } |
| } catch (Exception e) { |
| LOG.error("Problem downloading the page and splitting into sentences", e); |
| return new Triple<>(allFragms, downloadedPage, sents); |
| } |
| return new Triple<>(allFragms, downloadedPage, sents); |
| } |
| |
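  /**
   * For a single snippet fragment, tries to recover the full original sentence
   * (and its follow-up sentences) from the downloaded page; a fragment without
   * the marker is used as the candidate sentence itself.
   */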
| String[] formCandidateSentences(String fragment, Triple<List<String>, String, String[]> fragmentExtractionResults){ |
| String[] mainAndFollowSent = null; |
| |
| String downloadedPage = fragmentExtractionResults.getSecond(); |
| String[] sents = fragmentExtractionResults.getThird(); |
| |
| if (fragment.length() < 50) |
| return null; |
| // try to find original sentence from webpage |
| if (fragment.contains("_should_find_orig_") && sents != null |
| && sents.length > 0){ |
| try { |
| // first try sorted sentences from page by length approach |
| String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage); |
| |
| try { |
| mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment( |
| fragment.replace("_should_find_orig_", ""), sentsSortedByLength); |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
        // if the above gives null, then try to match all sentences from the snippet fragment
| if (mainAndFollowSent==null || mainAndFollowSent[0]==null){ |
| mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment( |
| fragment.replace("_should_find_orig_", ""), sents); |
| } |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
    } else {
      // no marker found: use the snippet fragment itself as the candidate sentence
      // (previously the fragment was discarded here, losing usable snippets)
      mainAndFollowSent = new String[] { fragment.replace("_should_find_orig_", "") };
    }
| |
| return mainAndFollowSent; |
| |
| } |
| |
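  /**
   * Scores candidate sentences against the seed sentence (syntactic generalization
   * score plus string distance) and, if they pass the relevance thresholds, forms
   * a Fragment combining the main sentence with its follow-up sentences.
   */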
| private Fragment verifyCandidateSentencesAndFormParagraph( |
| String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) { |
| Fragment result = null; |
| |
| String pageSentence = candidateSentences[0]; |
| StringBuilder followSent = new StringBuilder(); |
| for(int i = 1; i< candidateSentences.length; i++) |
| followSent.append(candidateSentences[i]); |
| String title = item.getTitle(); |
| |
    // resultant sentence SHOULD NOT be longer than four times the size of
    // the snippet fragment
    if (!(pageSentence != null && pageSentence.length() > 50)) {
      LOG.debug("Cannot accept the sentence (null or too short): {}", pageSentence);
| |
| return null; |
| } |
| |
| |
| try { // get score from syntactic match between sentence in |
| // original text and mined sentence |
| double measScore, syntScore, mentalScore = 0.0; |
| |
| SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence |
| + " " + title, originalSentence); |
| List<List<ParseTreeChunk>> match = matchRes.getMatchResult(); |
| if (match==null || match.size()<1){ |
| LOG.debug("Rejected Sentence : empty match {}", pageSentence); |
| return null; |
| } |
| |
| if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) { |
| LOG.debug("Rejected Sentence : No verb OR Yes imperative verb: {}", pageSentence); |
| return null; |
| } |
| |
| syntScore = parseTreeChunkListScorer.getParseTreeChunkListScore(match); |
| LOG.debug("{} {}\n pre-processed sent = '{}'",parseTreeChunk.listToString(match), syntScore, pageSentence); |
| |
| try { |
| if (sentsAll!=null && syntScore < RELEVANCE_THRESHOLD){ // 1.5) { // trying other sents |
| for (String currSent : sentsAll) { |
| if (currSent.startsWith(originalSentence)) |
| continue; |
| match = sm.assessRelevance(currSent, pageSentence).getMatchResult(); |
| double syntScoreCurr = parseTreeChunkListScorer.getParseTreeChunkListScore(match); |
| if (syntScoreCurr > syntScore) { |
| syntScore = syntScoreCurr; |
| } |
| } |
| if (syntScore > RELEVANCE_THRESHOLD) { |
| LOG.debug("Got match with other sent: {} {}", parseTreeChunk.listToString(match), syntScore); |
| } |
| } |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| |
| measScore = STRING_DISTANCE_MEASURER.measureStringDistance( |
| originalSentence, pageSentence); |
| |
| |
| if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5) |
| && measScore < 0.8 && pageSentence.length() > 40) // >70 |
| { |
| String pageSentenceProc = GeneratedSentenceProcessor |
| .acceptableMinedSentence(pageSentence); |
| if (pageSentenceProc != null) { |
| pageSentenceProc = GeneratedSentenceProcessor |
| .processSentence(pageSentenceProc); |
          followSent = new StringBuilder(StringUtils.defaultString(
              GeneratedSentenceProcessor.processSentence(followSent.toString())));
          if (followSent.length() > 0) {
            pageSentenceProc += " " + followSent;
          }
| |
| pageSentenceProc = Utils.convertToASCII(pageSentenceProc); |
| result = new Fragment(pageSentenceProc, syntScore + measScore |
| + mentalScore + (double) pageSentenceProc.length() / (double) 50); |
| result.setSourceURL(item.getUrl()); |
| result.fragment = fragment; |
| |
| LOG.debug("Accepted sentence: {} | with title = {}", pageSentenceProc, title); |
| LOG.debug("For fragment = {}", fragment); |
| } else |
| LOG.debug("Rejected sentence due to wrong area at webpage: {}", pageSentence); |
| } else |
| LOG.debug("Rejected sentence due to low score: {}", pageSentence); |
| // } |
| } catch (Throwable t) { |
| LOG.error(t.getLocalizedMessage(), t); |
| } |
| |
| return result; |
| } |
| |
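  /**
   * Builds a paragraph of generated text for a single search result: forms candidate
   * fragments from the snippet and the downloaded page, recovers candidate sentences
   * for each fragment, verifies their relevance against the seed sentence, and
   * attaches the accepted fragments to the result.
   */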
| public HitBase buildParagraphOfGeneratedText(HitBase item, |
| String originalSentence, List<String> sentsAll) { |
| List<Fragment> results = new ArrayList<>() ; |
| |
| Triple<List<String>, String, String[]> fragmentExtractionResults = formCandidateFragmentsForPage(item, originalSentence, sentsAll); |
| |
| List<String> allFragms = fragmentExtractionResults.getFirst(); |
| |
| for (String fragment : allFragms) { |
| String[] candidateSentences = formCandidateSentences(fragment, fragmentExtractionResults); |
| if (candidateSentences == null) |
| continue; |
| Fragment res = verifyCandidateSentencesAndFormParagraph(candidateSentences, item, fragment, originalSentence, sentsAll); |
| if (res!=null) |
| results.add(res); |
| |
| } |
| item.setFragments(results); |
| return item; |
| } |
| |
| |
| public static void main(String[] args) { |
| RelatedSentenceFinder f = new RelatedSentenceFinder(); |
| |
| List<HitBase> hits; |
| try { |
      // uncomment the sentence you would like to serve as a seed sentence for
      // content generation for an event description
| hits = f.generateContentAbout("Albert Einstein" |
| // "Britney Spears - The Femme Fatale Tour" |
| // "Rush Time Machine", |
| // "Blue Man Group" , |
| // "Belly Dance With Zaharah", |
| // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer", |
| // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis", |
| ); |
| LOG.info(HitBase.toString(hits)); |
| LOG.info(HitBase.toResultantString(hits)); |
      // WordFileGenerator.createWordDoc("Essay about Albert Einstein",
| // hits.get(0).getTitle(), hits); |
| |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| |
| } |
| |
| } |