/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.apps; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.List; | |
import org.apache.commons.lang.StringUtils; | |
import opennlp.tools.similarity.apps.GeneratedSentenceProcessor; | |
import opennlp.tools.similarity.apps.utils.PageFetcher; | |
import opennlp.tools.textsimilarity.TextProcessor; | |
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; | |
public class WebPageExtractor | |
{ | |
protected PageFetcher pageFetcher = new PageFetcher(); | |
protected ParserChunker2MatcherProcessor nlProc; | |
protected MostFrequentWordsFromPageGetter mostFrequentWordsFromPageGetter = new MostFrequentWordsFromPageGetter(); | |
protected static int sentThresholdLength = 70; | |
public List<String[]> extractSentencesWithPotentialProductKeywords(String url) | |
{ | |
int maxSentsFromPage= 20; | |
List<String[]> results = new ArrayList<String[]>(); | |
String downloadedPage = pageFetcher.fetchPage(url, 20000); | |
if (downloadedPage == null || downloadedPage.length() < 100) | |
{ | |
return null; | |
} | |
String pageOrigHTML = pageFetcher.fetchOrigHTML(url); | |
String pageTitle = StringUtils.substringBetween(pageOrigHTML, "<title>", "</title>" ); | |
pageTitle = pageTitle.replace(" ", ". ").replace("..", ".").replace(". . .", " ") | |
.replace(": ", ". ").replace("- ", ". ").replace(" |", ". "). | |
replace (". .",".").trim(); | |
List<String> pageTitles = new ArrayList<String>(); | |
pageTitles.addAll(TextProcessor.splitToSentences(pageTitle)); | |
pageTitles.addAll(Arrays.asList(pageTitle.split("."))); | |
String[] headerSections = pageOrigHTML.split("<h2"); | |
if (headerSections.length<2) | |
headerSections = pageOrigHTML.split("<h3"); | |
for(String section: headerSections){ | |
String header = StringUtils.substringBetween(section, ">", "<"); | |
if (header!=null && header.length()>20) | |
pageTitles.add(header); | |
} | |
downloadedPage= downloadedPage.replace(" ", "&"); | |
downloadedPage = downloadedPage.replaceAll("(?:&)+", "#"); | |
String[] sents = downloadedPage.split("#"); | |
List<TextChunk> sentsList = new ArrayList<TextChunk>(); | |
for(String s: sents){ | |
s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ") | |
.replace(": ", ". ").replace("- ", ". "). | |
replace (". .",".").trim(); | |
sentsList.add(new TextChunk(s, s.length())); | |
} | |
Collections.sort(sentsList, new TextChunkComparable()); | |
String[] longestSents = new String[maxSentsFromPage]; | |
int j=0; | |
for(int i=sentsList.size() -maxSentsFromPage; i< sentsList.size(); i++){ | |
longestSents[j] = sentsList.get(i).text; | |
j++; | |
} | |
sents = cleanListOfSents(longestSents); | |
List<String> mosFrequentWordsListFromPage = mostFrequentWordsFromPageGetter. getMostFrequentWordsInTextArr(sents); | |
// mostFrequentWordsFromPageGetter. getMostFrequentWordsInText(downloadedPage); | |
results.add(pageTitles.toArray(new String[0])); | |
results.add(mosFrequentWordsListFromPage.toArray(new String[0])); | |
results.add(sents); | |
return results; | |
} | |
protected String[] cleanListOfSents(String[] longestSents) | |
{ | |
List<String> sentsClean = new ArrayList<String>(); | |
for (String sentenceOrMultSent : longestSents) | |
{ | |
List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent); | |
for(String s : furtherSplit){ | |
if (s.replace('.','&').split("&").length>3) | |
continue; | |
if (s.indexOf('|')>-1) | |
continue; | |
if (s == null || s.trim().length() < sentThresholdLength || s.length() < sentThresholdLength + 10) | |
continue; | |
if (GeneratedSentenceProcessor.acceptableMinedSentence(s)==null){ | |
System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+s); | |
continue; | |
} | |
sentsClean.add(s); | |
} | |
} | |
return (String[]) sentsClean.toArray(new String[0]); | |
} | |
public class TextChunk { | |
public TextChunk(String s, int length) { | |
this.text = s; | |
this.len = length; | |
} | |
public String text; | |
public int len; | |
} | |
public class TextChunkComparable implements Comparator<TextChunk> | |
{ | |
public int compare(TextChunk ch1, TextChunk ch2) | |
{ | |
if (ch1.len>ch2.len) | |
return 1; | |
else if (ch1.len<ch2.len) | |
return -1; | |
else return 0; | |
} | |
} | |
public static void main(String[] args){ | |
WebPageExtractor extractor = new WebPageExtractor(); | |
List<String[]> res = | |
extractor.extractSentencesWithPotentialProductKeywords("http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/"); | |
System.out.println(res.get(1)); | |
} | |
} |