| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.similarity.apps; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.util.ArrayList; |
| import java.util.Comparator; |
| import java.util.List; |
| import java.util.regex.Pattern; |
| |
| import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; |
| import opennlp.tools.similarity.apps.utils.Utils; |
| import opennlp.tools.textsimilarity.ParseTreeChunk; |
| import opennlp.tools.textsimilarity.TextProcessor; |
| import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; |
| |
| import org.apache.commons.lang.StringUtils; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * This class supports content generation by static functions. |
| */ |
| public class ContentGeneratorSupport { |
| private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| |
| //TODO - verify regexp!! |
| private static final Pattern SPACES_PATTERN = Pattern.compile("([a-z])(\\s{2,3})([A-Z])"); |
| |
| /** |
| * Takes a sentence and extracts noun phrases and entity names to from search |
| * queries for finding relevant sentences on the web, which are then subject |
| * to relevance assessment by Similarity. Search queries should not be too |
| * general (irrelevant search results) or too specific (too few search |
| * results) |
| * |
| * @param sentence The input sentence to form queries |
| * @return List<String> of search expressions |
| */ |
| public static List<String> buildSearchEngineQueryFromSentence(String sentence) { |
| ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance(); |
| |
| List<ParseTreeChunk> nPhrases = pos |
| .formGroupedPhrasesFromChunksForSentence(sentence).get(0); |
| List<String> queryArrayStr = new ArrayList<>(); |
| for (ParseTreeChunk ch : nPhrases) { |
| String query = ""; |
| int size = ch.getLemmas().size(); |
| |
| for (int i = 0; i < size; i++) { |
| if (ch.getPOSs().get(i).startsWith("N") |
| || ch.getPOSs().get(i).startsWith("J")) { |
| query += ch.getLemmas().get(i) + " "; |
| } |
| } |
| query = query.trim(); |
| int len = query.split(" ").length; |
| if (len < 2 || len > 5) |
| continue; |
| if (len < 4) { // every word should start with capital |
| String[] qs = query.split(" "); |
| boolean bAccept = true; |
| for (String w : qs) { |
| if (w.toLowerCase().equals(w)) // idf only two words then |
| // has to be person name, |
| // title or geolocation |
| bAccept = false; |
| } |
| if (!bAccept) |
| continue; |
| } |
| |
| query = query.trim().replace(" ", " +"); |
| query = " +" + query; |
| |
| queryArrayStr.add(query); |
| |
| } |
| if (queryArrayStr.size() < 1) { // release constraints on NP down to 2 |
| // keywords |
| for (ParseTreeChunk ch : nPhrases) { |
| String query = ""; |
| int size = ch.getLemmas().size(); |
| |
| for (int i = 0; i < size; i++) { |
| if (ch.getPOSs().get(i).startsWith("N") |
| || ch.getPOSs().get(i).startsWith("J")) { |
| query += ch.getLemmas().get(i) + " "; |
| } |
| } |
| query = query.trim(); |
| int len = query.split(" ").length; |
| if (len < 2) |
| continue; |
| |
| query = query.trim().replace(" ", " +"); |
| query = " +" + query; |
| |
| queryArrayStr.add(query); |
| |
| } |
| } |
| |
| queryArrayStr = removeDuplicatesFromQueries(queryArrayStr); |
| queryArrayStr.add(sentence); |
| |
| return queryArrayStr; |
| |
| } |
| |
| public static String[] cleanListOfSents(String[] sents) { |
| List<String> sentsClean = new ArrayList<>(); |
| for (String s : sents) { |
| if (s == null || s.trim().length() < 30 || s.length() < 20) |
| continue; |
| sentsClean.add(s); |
| } |
| return sentsClean.toArray(new String[0]); |
| } |
| |
| public static String cleanSpacesInCleanedHTMLpage(String pageContent){ //was 4 spaces |
| //was 3 spaces => now back to 2 |
| pageContent = pageContent.trim(); |
| pageContent = SPACES_PATTERN.matcher(pageContent).replaceAll("$1. $3") |
| .replace("..", ".").replace(". . .", " ") |
| .replace(". .",". ").trim(); |
| // sometimes html breaks are converted into ' ' (two spaces), so |
| // we need to put '.' |
| return pageContent; |
| } |
| |
| /** |
| * remove dupes from queries to easy cleaning dupes and repetitive search |
| * afterwards |
| * |
| * @param hits A List<String> of sentences (search queries, or search results |
| * abstracts, or titles |
| * @return List<String> of sentences where dupes are removed |
| */ |
| public static List<String> removeDuplicatesFromQueries(List<String> hits) { |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
| double dupeThresh = 0.8; // if more similar, then considered dupes was |
| // 0.7 |
| List<Integer> idsToRemove = new ArrayList<>(); |
| List<String> hitsDedup = new ArrayList<>(); |
| try { |
| for (int i = 0; i < hits.size(); i++) |
| for (int j = i + 1; j < hits.size(); j++) { |
| String title1 = hits.get(i); |
| String title2 = hits.get(j); |
| if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2)) |
| continue; |
| if (meas.measureStringDistance(title1, title2) > dupeThresh) { |
| idsToRemove.add(j); // dupes found, later list member to be deleted |
| } |
| } |
| |
| for (int i = 0; i < hits.size(); i++) |
| if (!idsToRemove.contains(i)) |
| hitsDedup.add(hits.get(i)); |
| |
| if (hitsDedup.size() < hits.size()) { |
| LOG.info("Removed duplicates from formed query, including {}", hits.get(idsToRemove.get(0))); |
| } |
| |
| } catch (Exception e) { |
| LOG.error("Problem removing duplicates from query list", e); |
| } |
| |
| return hitsDedup; |
| |
| } |
| |
| /** |
| * remove dupes from search results |
| * |
| * @param hits A List<HitBase> of search results objects |
| * @return List<String> of search results objects where dupes are removed |
| */ |
| public static List<HitBase> removeDuplicatesFromResultantHits(List<HitBase> hits) { |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
| double dupeThresh = // 0.8; // if more similar, then considered dupes was |
| 0.7; |
| try { |
| for (int i = 0; i < hits.size(); i++) |
| for (int j = i + 1; j < hits.size(); j++) { |
| HitBase hit2 = hits.get(j); |
| List<Fragment> fragmList1 = hits.get(i).getFragments(); |
| List<Fragment> fragmList2 = hits.get(j).getFragments(); |
| List<Fragment> fragmList2Results = new ArrayList<>(fragmList2); |
| for (Fragment f1 : fragmList1) |
| for (Fragment f2 : fragmList2) { |
| String sf1 = f1.getResultText(); |
| String sf2 = f2.getResultText(); |
| if (StringUtils.isEmpty(sf1) || StringUtils.isEmpty(sf2)) |
| continue; |
| if (meas.measureStringDistance(sf1, sf2) > dupeThresh) { |
| fragmList2Results.remove(f2); |
| LOG.info("Removed duplicates from formed fragments list: {}", sf2); |
| } |
| } |
| |
| hit2.setFragments(fragmList2Results); |
| hits.set(j, hit2); |
| } |
| } catch (Exception e) { |
| LOG.error("Problem removing duplicates from list of fragment", e); |
| } |
| return hits; |
| } |
| |
| |
| |
| // given a fragment from snippet, finds an original sentence at a webpage by |
| // optimizing alignment score |
| public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment( |
| String fragment, String[] sents) { |
| if (fragment.trim().length() < 15) |
| return null; |
| |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
| double dist = 0.0; |
| String result = null, followSent = ""; |
| for (int i = 0; i < sents.length; i++) { |
| String s = sents[i]; |
| if (s == null || s.length() < 30) |
| continue; |
| double distCurr = meas.measureStringDistance(s, fragment); |
| if (distCurr > dist && distCurr > 0.4) { |
| result = s; |
| dist = distCurr; |
| try { |
| if (i < sents.length - 1 && sents[i + 1].length() > 60) { |
| String f1 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+1]); |
| if (f1!=null){ |
| followSent = f1; |
| } |
| } |
| |
| if (i < sents.length - 2 && sents[i + 2].length() > 60) { |
| String f2 = GeneratedSentenceProcessor.acceptableMinedSentence(sents[i+2]); |
| if (f2!=null){ |
| followSent += " "+f2; |
| } |
| } |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| } |
| } |
| return new String[] { result, followSent }; |
| } |
| |
| // given a fragment from snippet, finds an original sentence at a webpage by |
| // optimizing alignment score |
| public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment( |
| String fragment, String[] sents) { |
| if (fragment.trim().length() < 15) |
| return null; |
| int bestSentIndex = -1; |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
| double distBest = 10.0; // + sup |
| String result = null, followSent = null; |
| for (int i = 0; i < sents.length; i++) { |
| String s = sents[i]; |
| if (s == null || s.length() < 30) |
| continue; |
| double distCurr = meas.measureStringDistance(s, fragment); |
| if (distCurr > distBest) { |
| distBest = distCurr; |
| bestSentIndex = i; |
| } |
| |
| } |
| if (distBest > 0.4) { |
| result = sents[bestSentIndex]; |
| |
| if (bestSentIndex < sents.length - 1 |
| && sents[bestSentIndex + 1].length() > 60) { |
| followSent = sents[bestSentIndex + 1]; |
| } |
| |
| } |
| |
| return new String[] { result, followSent }; |
| } |
| |
| public String[] extractSentencesFromPage(String downloadedPage) |
| { |
| |
| int maxSentsFromPage= 100; |
| List<String[]> results = new ArrayList<>(); |
| |
| //String pageOrigHTML = pFetcher.fetchOrigHTML(url); |
| |
| downloadedPage= downloadedPage.replace(" ", "&"); |
| downloadedPage = downloadedPage.replaceAll("(?:&)+", "#"); |
| String[] sents = downloadedPage.split("#"); |
| List<TextChunk> sentsList = new ArrayList<>(); |
| for (String s: sents) { |
| s = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(s); |
| sentsList.add(new TextChunk(s, s.length())); |
| } |
| |
| sentsList.sort(new TextChunkComparable()); |
| String[] longestSents = new String[maxSentsFromPage]; |
| int j=0; |
| int initIndex = sentsList.size()-1 -maxSentsFromPage; |
| if (initIndex<0) |
| initIndex = 0; |
| for (int i=initIndex; i< sentsList.size() && j<maxSentsFromPage ; i++) { |
| longestSents[j] = sentsList.get(i).text; |
| j++; |
| } |
| |
| sents = cleanSplitListOfSents(longestSents); |
| |
| //sents = removeDuplicates(sents); |
| //sents = verifyEnforceStartsUpperCase(sents); |
| |
| return sents; |
| } |
| |
| public static class TextChunk { |
| public TextChunk(String s, int length) { |
| this.text = s; |
| this.len = length; |
| } |
| public final String text; |
| public final int len; |
| } |
| |
| public static class TextChunkComparable implements Comparator<TextChunk> { |
| |
| @Override |
| public int compare(TextChunk ch1, TextChunk ch2) { |
| return Integer.compare(ch1.len, ch2.len); |
| } |
| } |
| |
| protected String[] cleanSplitListOfSents(String[] longestSents){ |
| float minFragmentLength = 40, minFragmentLengthSpace=4; |
| |
| List<String> sentsClean = new ArrayList<>(); |
| for (String sentenceOrMultSent : longestSents) { |
| if (sentenceOrMultSent==null || sentenceOrMultSent.length()<20) |
| continue; |
| if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null) { |
| LOG.debug("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = {}", |
| sentenceOrMultSent); |
| continue; |
| } |
| // aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n. |
| int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length; |
| float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots; |
| if ( avgSentenceLengthInTextPortion<minFragmentLength) |
| continue; |
| // o oo o ooo o o o ooo oo ooo o o oo |
| numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length; |
| avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots; |
| if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace) |
| continue; |
| |
| List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent); |
| |
| // forced split by ',' somewhere in the middle of sentence |
| // disused - Feb 26 13 |
| //furtherSplit = furtherMakeSentencesShorter(furtherSplit); |
| furtherSplit.remove(furtherSplit.size()-1); |
| for(String s : furtherSplit) { |
| if (s.indexOf('|')>-1) |
| continue; |
| s = s.replace("<em>"," ").replace("</em>"," "); |
| s = Utils.convertToASCII(s); |
| sentsClean.add(s); |
| } |
| } |
| return sentsClean.toArray(new String[0]); |
| } |
| |
| protected String[] cleanSplitListOfSentsFirstSplit(String[] longestSents){ |
| float minFragmentLength = 40, minFragmentLengthSpace=4; |
| |
| List<String> sentsClean = new ArrayList<>(); |
| for (String sentenceOrMultSent : longestSents) { |
| if (sentenceOrMultSent==null || sentenceOrMultSent.length()<minFragmentLength) |
| continue; |
| List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent); |
| for(String sentence: furtherSplit ) { |
| if (sentence==null || sentence.length()<20) |
| continue; |
| if (GeneratedSentenceProcessor.acceptableMinedSentence(sentence)==null){ |
| //System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent); |
| continue; |
| } |
| // aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n. |
| int numOfDots = sentence.replace('.','&').split("&").length; |
| float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots; |
| if ( avgSentenceLengthInTextPortion<minFragmentLength) |
| continue; |
| // o oo o ooo o o o ooo oo ooo o o oo |
| numOfDots = sentence.replace(' ','&').split("&").length; |
| avgSentenceLengthInTextPortion = (float)sentence.length() /(float) numOfDots; |
| if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace) |
| continue; |
| |
| // forced split by ',' somewhere in the middle of sentence |
| // disused - Feb 26 13 |
| //furtherSplit = furtherMakeSentencesShorter(furtherSplit); |
| //furtherSplit.remove(furtherSplit.size()-1); |
| |
| if (sentence.indexOf('|')>-1) |
| continue; |
| sentence = Utils.convertToASCII(sentence); |
| sentsClean.add(sentence); |
| } |
| } |
| return sentsClean.toArray(new String[0]); |
| } |
| |
| public static String getPortionOfTitleWithoutDelimiters(String title){ |
| String[] delimiters = new String[]{"\\+","-", "=", "_", "\\)", "\\|"}; |
| for(String delim: delimiters ){ |
| String[] split = title.split(delim); |
| if (split.length>1){ |
| for(String s: split){ |
| if (!s.contains(".")) |
| return s; |
| } |
| } |
| } |
| |
| return title; |
| } |
| |
| public static void main(String[] args){ |
| String s = "You can grouP parts Of your regular expression In your pattern You grouP elements"; |
| //with round brackets, e.g., ()." + |
| // " This allows you to assign a repetition operator to a complete group."; |
| String sr = s.replaceAll("([a-z])(\\s{2,3})([A-Z])", "$1. $3"); |
| String sr1 = s.replaceAll(" [A-Z]", ". $0"); |
| sr = s.replaceAll("[a-z] [A-Z]", ". $1"); |
| sr1 = s.replaceAll(" [A-Z]", ". $1"); |
| } |
| |
| public static boolean problematicHitList(List<HitBase> hits){ |
| if (hits.size()<1) |
| return true; |
| for(HitBase hit: hits){ |
| if (!hit.getFragments().isEmpty()) |
| return false; |
| } |
| return true; |
| } |
| } |
| |
| |
| |