blob: 94960e5cb36b11cbe8fedbf4a703f9669ee64b28 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.apps;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.textsimilarity.TextProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Fetches a web page and extracts three groups of strings from it: fragments of the page
 * title plus section headers, the most frequent words of the selected sentences, and the
 * longest text chunks of the page after sentence-level cleaning.
 */
public class WebPageExtractor {
  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  protected final PageFetcher pageFetcher = new PageFetcher();
  protected final MostFrequentWordsFromPageGetter mostFrequentWordsFromPageGetter = new MostFrequentWordsFromPageGetter();

  /** Minimum trimmed length a sentence must have to survive {@link #cleanListOfSents}. */
  protected static final int SENT_THRESHOLD_LENGTH = 70;

  /** Upper bound on the number of longest text chunks taken from a page. */
  private static final int MAX_SENTS_FROM_PAGE = 20;

  /** Fetched pages shorter than this many characters are treated as unusable. */
  private static final int MIN_PAGE_LENGTH = 100;

  /** A header must be strictly longer than this many characters to be collected. */
  private static final int MIN_HEADER_LENGTH = 20;

  /**
   * Fetches the page at {@code url} and extracts title/header fragments, frequent words and
   * cleaned long sentences from it.
   *
   * @param url address of the page to fetch
   * @return a list of exactly three arrays: index 0 holds title fragments and section
   *         headers, index 1 holds the most frequent words computed from the cleaned
   *         sentences, index 2 holds the cleaned sentences themselves; or {@code null}
   *         when the fetched page text is {@code null} or shorter than 100 characters
   */
  public List<String[]> extractSentencesWithPotentialProductKeywords(String url) {
    // The second argument is passed straight to PageFetcher.fetchPage;
    // presumably a timeout in milliseconds - TODO confirm against PageFetcher.
    String downloadedPage = pageFetcher.fetchPage(url, 20000);
    if (downloadedPage == null || downloadedPage.length() < MIN_PAGE_LENGTH) {
      // Kept as null (not an empty list) so existing callers that test for null keep working.
      return null;
    }

    String pageOrigHTML = pageFetcher.fetchOrigHTML(url);
    List<String> pageTitles = collectTitlesAndHeaders(pageOrigHTML);

    String[] sents = cleanListOfSents(selectLongestChunks(downloadedPage, MAX_SENTS_FROM_PAGE));
    List<String> mostFrequentWordsListFromPage =
        mostFrequentWordsFromPageGetter.getMostFrequentWordsInTextArr(sents);

    List<String[]> results = new ArrayList<>();
    results.add(pageTitles.toArray(new String[0]));
    results.add(mostFrequentWordsListFromPage.toArray(new String[0]));
    results.add(sents);
    return results;
  }

  /**
   * Collects sentence-level and dot-separated fragments of the {@code <title>} element plus
   * the text of {@code <h2>} (or, failing that, {@code <h3>}) headers.
   * Returns an empty list when the HTML is {@code null}.
   */
  private static List<String> collectTitlesAndHeaders(String pageOrigHTML) {
    List<String> pageTitles = new ArrayList<>();
    if (pageOrigHTML == null) {
      return pageTitles;
    }

    // substringBetween yields null when either tag is missing, so guard before normalizing.
    String pageTitle = StringUtils.substringBetween(pageOrigHTML, "<title>", "</title>");
    if (pageTitle != null) {
      // NOTE(review): in this copy the first literal is a single space, which rewrites every
      // word boundary; this may be an artifact of whitespace collapsing - confirm against
      // the canonical source before changing it.
      pageTitle = pageTitle.replace(" ", ". ").replace("..", ".").replace(". . .", " ")
          .replace(": ", ". ").replace("- ", ". ").replace(" |", ". ")
          .replace(". .", ".").trim();
      pageTitles.addAll(TextProcessor.splitToSentences(pageTitle));
      // String.split takes a regex: an unescaped "." matches every character and produces an
      // empty array, so the dot has to be escaped to split on literal periods.
      for (String fragment : pageTitle.split("\\.")) {
        String trimmed = fragment.trim();
        if (!trimmed.isEmpty()) {
          pageTitles.add(trimmed);
        }
      }
    }

    String[] headerSections = pageOrigHTML.split("<h2");
    if (headerSections.length < 2) {
      headerSections = pageOrigHTML.split("<h3");
    }
    // Element 0 is whatever precedes the first header tag, so it is not a header itself.
    for (int i = 1; i < headerSections.length; i++) {
      String header = StringUtils.substringBetween(headerSections[i], ">", "<");
      if (header != null && header.length() > MIN_HEADER_LENGTH) {
        pageTitles.add(header);
      }
    }
    return pageTitles;
  }

  /**
   * Splits the page text into chunks, normalizes each chunk and returns at most
   * {@code maxChunks} of the longest ones, ordered from shorter to longer.
   */
  private static String[] selectLongestChunks(String downloadedPage, int maxChunks) {
    // NOTE(review): the literal replaced here is a single space in this copy, which turns
    // every space into a chunk boundary; possibly a collapsed run of spaces - confirm.
    String delimited = downloadedPage.replace(" ", "&").replaceAll("(?:&)+", "#");

    List<TextChunk> chunks = new ArrayList<>();
    for (String raw : delimited.split("#")) {
      String normalized = raw.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ")
          .replace(": ", ". ").replace("- ", ". ")
          .replace(". .", ".").trim();
      chunks.add(new TextChunk(normalized, normalized.length()));
    }
    chunks.sort(new TextChunkComparable());

    // Clamp the start index: a page with fewer chunks than maxChunks must not produce a
    // negative index, and the result array must not contain unfilled (null) slots.
    int from = Math.max(0, chunks.size() - maxChunks);
    String[] longest = new String[chunks.size() - from];
    for (int i = from; i < chunks.size(); i++) {
      longest[i - from] = chunks.get(i).text;
    }
    return longest;
  }

  /**
   * Splits each input string into sentences and keeps only those that pass all filters:
   * at most three segments when split on {@code '.'} or {@code '&'}, no {@code '|'}
   * character, a trimmed length of at least {@link #SENT_THRESHOLD_LENGTH} and a raw length
   * of at least {@code SENT_THRESHOLD_LENGTH + 10}, and acceptance by
   * {@code GeneratedSentenceProcessor.acceptableMinedSentence}.
   *
   * @param longestSents candidate text chunks; {@code null} elements are skipped
   * @return the accepted sentences, possibly empty, never {@code null}
   */
  protected String[] cleanListOfSents(String[] longestSents) {
    List<String> sentsClean = new ArrayList<>();
    if (longestSents == null) {
      return new String[0];
    }
    for (String sentenceOrMultSent : longestSents) {
      if (sentenceOrMultSent == null) {
        continue;
      }
      List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
      for (String s : furtherSplit) {
        // The null check has to come before any dereference of s.
        if (s == null) {
          continue;
        }
        if (s.replace('.', '&').split("&").length > 3) {
          continue;
        }
        if (s.indexOf('|') > -1) {
          continue;
        }
        if (s.trim().length() < SENT_THRESHOLD_LENGTH || s.length() < SENT_THRESHOLD_LENGTH + 10) {
          continue;
        }
        if (GeneratedSentenceProcessor.acceptableMinedSentence(s) == null) {
          LOG.debug("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = {}", s);
          continue;
        }
        sentsClean.add(s);
      }
    }
    return sentsClean.toArray(new String[0]);
  }

  /** Immutable pair of a text fragment and the length recorded for it. */
  public static class TextChunk {
    public final String text;
    public final int len;

    /**
     * @param s the text of the chunk
     * @param length the length value stored for the chunk
     */
    public TextChunk(String s, int length) {
      this.text = s;
      this.len = length;
    }
  }

  /** Orders {@link TextChunk} instances by ascending {@code len}. */
  public static class TextChunkComparable implements Comparator<TextChunk> {
    @Override
    public int compare(TextChunk ch1, TextChunk ch2) {
      return Integer.compare(ch1.len, ch2.len);
    }
  }

  /** Demo entry point: extracts from a sample page and logs its most frequent words. */
  public static void main(String[] args) {
    WebPageExtractor extractor = new WebPageExtractor();
    List<String[]> res = extractor.extractSentencesWithPotentialProductKeywords(
        "http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");
    // The extractor returns null when the page could not be fetched or is too short.
    if (res == null) {
      LOG.warn("No usable content could be extracted from the page");
      return;
    }
    LOG.info(Arrays.toString(res.get(1)));
  }
}