opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.tools.parse_thicket.apps;

 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;

 import org.apache.commons.lang.StringUtils;

 import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
 import opennlp.tools.similarity.apps.utils.PageFetcher;
 import opennlp.tools.textsimilarity.TextProcessor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Pulls candidate product-describing text out of a web page: phrases taken from
  * the {@code <title>} and header tags, the most frequent words, and the longest
  * sentences that survive {@link #cleanListOfSents(String[])}.
  */
 public class WebPageExtractor {

 	private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

 	protected final PageFetcher pageFetcher = new PageFetcher();

 	protected final MostFrequentWordsFromPageGetter mostFrequentWordsFromPageGetter = new MostFrequentWordsFromPageGetter();

 	/**
 	 * Minimum trimmed length of a sentence kept by {@link #cleanListOfSents(String[])};
 	 * the untrimmed sentence must additionally be at least this value plus 10 characters long.
 	 */
 	protected static final int SENT_THRESHOLD_LENGTH = 70;

 	/**
 	 * Fetches the page at {@code url} and extracts title phrases, frequent words and sentences.
 	 *
 	 * @param url address of the page to fetch
 	 * @return a list of exactly three arrays: index 0 holds title/header phrases, index 1 the
 	 *         most frequent words computed from the cleaned sentences, index 2 the cleaned
 	 *         sentences themselves; or {@code null} when the fetched page text is missing or
 	 *         shorter than 100 characters
 	 */
 	public List<String[]> extractSentencesWithPotentialProductKeywords(String url) {
 		final int maxSentsFromPage = 20;

 		// NOTE(review): the meaning of 20000 is defined by PageFetcher (presumably a timeout or
 		// size limit) - confirm against that class.
 		String downloadedPage = pageFetcher.fetchPage(url, 20000);
 		if (downloadedPage == null || downloadedPage.length() < 100) {
 			return null;
 		}

 		String pageOrigHTML = pageFetcher.fetchOrigHTML(url);
 		List<String> pageTitles = extractPageTitles(pageOrigHTML);

 		String[] sents = cleanListOfSents(selectLongestChunks(downloadedPage, maxSentsFromPage));

 		// Frequent words are computed from the cleaned sentences only, not from the whole page
 		// (getMostFrequentWordsInText(downloadedPage) was left commented out in the original).
 		List<String> mostFrequentWordsFromPage = mostFrequentWordsFromPageGetter.getMostFrequentWordsInTextArr(sents);

 		List<String[]> results = new ArrayList<>();
 		results.add(pageTitles.toArray(new String[0]));
 		results.add(mostFrequentWordsFromPage.toArray(new String[0]));
 		results.add(sents);
 		return results;
 	}

 	/**
 	 * Collects phrases from the {@code <title>} element and from {@code <h2>} (or, failing
 	 * that, {@code <h3>}) sections of the raw HTML.
 	 *
 	 * @param pageOrigHTML raw HTML of the page, may be {@code null}
 	 * @return the collected phrases, never {@code null}; empty when no HTML is available
 	 */
 	private static List<String> extractPageTitles(String pageOrigHTML) {
 		List<String> pageTitles = new ArrayList<>();
 		if (pageOrigHTML == null) {
 			return pageTitles;
 		}

 		// substringBetween yields null when the page has no <title> element.
 		String pageTitle = StringUtils.substringBetween(pageOrigHTML, "<title>", "</title>");
 		if (pageTitle != null) {
 			pageTitle = pageTitle.replace("  ", ". ").replace("..", ".").replace(". . .", " ")
 					.replace(": ", ". ").replace("- ", ". ").replace(" |", ". ")
 					.replace(". .", ".").trim();
 			pageTitles.addAll(TextProcessor.splitToSentences(pageTitle));
 			// String.split takes a regex: an unescaped "." matches every character and
 			// produces no fragments at all, so the dot has to be escaped.
 			for (String fragment : pageTitle.split("\\.")) {
 				String phrase = fragment.trim();
 				if (!phrase.isEmpty() && !pageTitles.contains(phrase)) {
 					pageTitles.add(phrase);
 				}
 			}
 		}

 		String[] headerSections = pageOrigHTML.split("<h2");
 		if (headerSections.length < 2) {
 			headerSections = pageOrigHTML.split("<h3");
 		}
 		for (String section : headerSections) {
 			// Text between the first '>' and the following '<' of the section.
 			String header = StringUtils.substringBetween(section, ">", "<");
 			if (header != null && header.length() > 20) {
 				pageTitles.add(header);
 			}
 		}
 		return pageTitles;
 	}

 	/**
 	 * Splits the page text into chunks at runs of five spaces and/or ampersands (and at '#'),
 	 * normalizes their punctuation and returns the longest ones.
 	 *
 	 * @param downloadedPage page text, must not be {@code null}
 	 * @param maxChunks upper bound on the number of chunks returned
 	 * @return at most {@code maxChunks} chunks in ascending order of length; fewer when the
 	 *         page does not contain that many chunks
 	 */
 	private static String[] selectLongestChunks(String downloadedPage, int maxChunks) {
 		String marked = downloadedPage.replace("     ", "&").replaceAll("(?:&)+", "#");

 		List<TextChunk> chunks = new ArrayList<>();
 		for (String raw : marked.split("#")) {
 			String s = raw.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")
 					.replace(": ", ". ").replace("- ", ". ")
 					.replace(". .", ".").trim();
 			chunks.add(new TextChunk(s, s.length()));
 		}
 		chunks.sort(new TextChunkComparable());

 		// Clamp the start index: a page with fewer chunks than requested must not
 		// produce a negative index.
 		int from = Math.max(0, chunks.size() - maxChunks);
 		String[] longest = new String[chunks.size() - from];
 		for (int i = from; i < chunks.size(); i++) {
 			longest[i - from] = chunks.get(i).text;
 		}
 		return longest;
 	}

 	/**
 	 * Splits each entry into sentences and keeps only those that pass all filters: long
 	 * enough, at most three '.'/'&'-separated segments, no '|' character, and accepted by
 	 * {@code GeneratedSentenceProcessor.acceptableMinedSentence}.
 	 *
 	 * @param longestSents text chunks, each possibly holding several sentences; {@code null}
 	 *                     entries are skipped
 	 * @return the accepted sentences, never {@code null}
 	 */
 	protected String[] cleanListOfSents(String[] longestSents) {
 		List<String> sentsClean = new ArrayList<>();
 		if (longestSents == null) {
 			return new String[0];
 		}
 		for (String sentenceOrMultSent : longestSents) {
 			if (sentenceOrMultSent == null) {
 				continue;
 			}
 			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
 			if (furtherSplit == null) {
 				continue;
 			}
 			for (String s : furtherSplit) {
 				// The null check has to come before any dereference of s.
 				if (s == null || s.trim().length() < SENT_THRESHOLD_LENGTH || s.length() < SENT_THRESHOLD_LENGTH + 10) {
 					continue;
 				}
 				if (s.replace('.', '&').split("&").length > 3) {
 					continue;
 				}
 				if (s.indexOf('|') > -1) {
 					continue;
 				}
 				if (GeneratedSentenceProcessor.acceptableMinedSentence(s) == null) {
 					LOG.debug("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = {}", s);
 					continue;
 				}
 				sentsClean.add(s);
 			}
 		}
 		return sentsClean.toArray(new String[0]);
 	}

 	/** A piece of text together with a length value supplied by the creator. */
 	public static class TextChunk {
 		public TextChunk(String s, int length) {
 			this.text = s;
 			this.len = length;
 		}
 		public final String text;
 		public final int len;
 	}

 	/** Orders {@link TextChunk}s by ascending {@code len}. */
 	public static class TextChunkComparable implements Comparator<TextChunk> {

 		@Override
 		public int compare(TextChunk ch1, TextChunk ch2) {
 			return Integer.compare(ch1.len, ch2.len);
 		}
 	}

 	/** Demo entry point: logs the most frequent words found on a sample page. */
 	public static void main(String[] args) {
 		WebPageExtractor extractor = new WebPageExtractor();
 		List<String[]> res = extractor.extractSentencesWithPotentialProductKeywords(
 						"http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");
 		if (res == null) {
 			// The extractor returns null when the page could not be fetched or is too short.
 			LOG.warn("No usable content could be extracted from the page");
 			return;
 		}
 		LOG.info(Arrays.toString(res.get(1)));
 	}

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.tools.parse_thicket.apps;

	import java.lang.invoke.MethodHandles;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Comparator;
	import java.util.List;

	import org.apache.commons.lang.StringUtils;

	import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
	import opennlp.tools.similarity.apps.utils.PageFetcher;
	import opennlp.tools.textsimilarity.TextProcessor;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	public class WebPageExtractor {

	private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

	protected final PageFetcher pageFetcher = new PageFetcher();

	protected final MostFrequentWordsFromPageGetter mostFrequentWordsFromPageGetter = new MostFrequentWordsFromPageGetter();

	protected static final int SENT_THRESHOLD_LENGTH = 70;

	public List<String[]> extractSentencesWithPotentialProductKeywords(String url) {
	int maxSentsFromPage= 20;
	List<String[]> results = new ArrayList<>();

	String downloadedPage = pageFetcher.fetchPage(url, 20000);
	if (downloadedPage == null \|\| downloadedPage.length() < 100)
	{
	return null;
	}

	String pageOrigHTML = pageFetcher.fetchOrigHTML(url);
	String pageTitle = StringUtils.substringBetween(pageOrigHTML, "<title>", "</title>" );
	pageTitle = pageTitle.replace(" ", ". ").replace("..", ".").replace(". . .", " ")
	.replace(": ", ". ").replace("- ", ". ").replace(" \|", ". ").
	replace (". .",".").trim();
	List<String> pageTitles = new ArrayList<>();
	pageTitles.addAll(TextProcessor.splitToSentences(pageTitle));
	pageTitles.addAll(Arrays.asList(pageTitle.split(".")));

	String[] headerSections = pageOrigHTML.split("<h2");
	if (headerSections.length<2)
	headerSections = pageOrigHTML.split("<h3");
	for(String section: headerSections){
	String header = StringUtils.substringBetween(section, ">", "<");
	if (header!=null && header.length()>20)
	pageTitles.add(header);
	}

	downloadedPage= downloadedPage.replace(" ", "&");
	downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
	String[] sents = downloadedPage.split("#");
	List<TextChunk> sentsList = new ArrayList<>();
	for(String s: sents){
	s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ")
	.replace(": ", ". ").replace("- ", ". ").
	replace (". .",".").trim();
	sentsList.add(new TextChunk(s, s.length()));
	}
	sentsList.sort(new TextChunkComparable());

	String[] longestSents = new String[maxSentsFromPage];
	int j=0;
	for(int i=sentsList.size() -maxSentsFromPage; i< sentsList.size(); i++){
	longestSents[j] = sentsList.get(i).text;
	j++;
	}

	sents = cleanListOfSents(longestSents);

	List<String> mosFrequentWordsListFromPage = mostFrequentWordsFromPageGetter. getMostFrequentWordsInTextArr(sents);
	// mostFrequentWordsFromPageGetter. getMostFrequentWordsInText(downloadedPage);

	results.add(pageTitles.toArray(new String[0]));
	results.add(mosFrequentWordsListFromPage.toArray(new String[0]));
	results.add(sents);

	return results;
	}

	protected String[] cleanListOfSents(String[] longestSents) {
	List<String> sentsClean = new ArrayList<>();
	for (String sentenceOrMultSent : longestSents) {
	List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
	for(String s : furtherSplit) {
	if (s.replace('.','&').split("&").length>3)
	continue;
	if (s.indexOf('\|')>-1)
	continue;
	if (s == null \|\| s.trim().length() < SENT_THRESHOLD_LENGTH \|\| s.length() < SENT_THRESHOLD_LENGTH + 10)
	continue;
	if (GeneratedSentenceProcessor.acceptableMinedSentence(s)==null) {
	LOG.debug("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = {}", s);
	continue;
	}
	sentsClean.add(s);
	}
	}
	return sentsClean.toArray(new String[0]);
	}

	public static class TextChunk {
	public TextChunk(String s, int length) {
	this.text = s;
	this.len = length;
	}
	public final String text;
	public final int len;
	}

	public static class TextChunkComparable implements Comparator<TextChunk> {

	@Override
	public int compare(TextChunk ch1, TextChunk ch2) {
	return Integer.compare(ch1.len, ch2.len);

	}
	}

	public static void main(String[] args){
	WebPageExtractor extractor = new WebPageExtractor();
	List<String[]> res = extractor.extractSentencesWithPotentialProductKeywords(
	"http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");
	LOG.info(Arrays.toString(res.get(1)));
	}

	}