// opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/WebPageExtractor.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.tools.parse_thicket.apps;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;

 import org.apache.commons.lang.StringUtils;

 import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
 import opennlp.tools.similarity.apps.utils.PageFetcher;
 import opennlp.tools.textsimilarity.TextProcessor;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

 /**
  * Fetches a web page and extracts three groups of strings from it: title/header candidates,
  * the most frequent words of the retained sentences, and the retained sentences themselves.
  */
 public class WebPageExtractor
 {
 	/** Used to download both the page text and the original HTML of a URL. */
 	protected PageFetcher pageFetcher = new PageFetcher();

 	/** Not referenced by this class; kept because it is part of the protected surface. */
 	protected ParserChunker2MatcherProcessor nlProc;
 	protected MostFrequentWordsFromPageGetter mostFrequentWordsFromPageGetter = new MostFrequentWordsFromPageGetter();

 	/** Minimum trimmed length a sentence must have to survive {@link #cleanListOfSents(String[])}. */
 	protected static int sentThresholdLength = 70;

 	/**
 	 * Extracts title candidates, frequent words and long sentences from the page at {@code url}.
 	 *
 	 * @param url address of the page to fetch
 	 * @return a list of exactly three arrays, in this order: title/header candidates, most frequent
 	 *         words computed from the retained sentences, and the retained sentences; or
 	 *         {@code null} when the fetched page text is {@code null} or shorter than 100 characters
 	 */
 	public List<String[]> extractSentencesWithPotentialProductKeywords(String url)
 	{
 		int maxSentsFromPage = 20;

 		// NOTE(review): the meaning of 20000 (timeout vs. size limit) is defined by PageFetcher - confirm
 		String downloadedPage = pageFetcher.fetchPage(url, 20000);
 		if (downloadedPage == null || downloadedPage.length() < 100)
 		{
 			// null (not an empty list) is kept so existing callers' null checks keep working
 			return null;
 		}

 		String pageOrigHTML = pageFetcher.fetchOrigHTML(url);
 		List<String> pageTitles = collectTitlesAndHeaders(pageOrigHTML);

 		String[] sents = cleanListOfSents(selectLongestChunks(downloadedPage, maxSentsFromPage));

 		List<String> mostFrequentWordsListFromPage =
 				mostFrequentWordsFromPageGetter.getMostFrequentWordsInTextArr(sents);

 		List<String[]> results = new ArrayList<String[]>();
 		results.add(pageTitles.toArray(new String[0]));
 		results.add(mostFrequentWordsListFromPage.toArray(new String[0]));
 		results.add(sents);
 		return results;
 	}

 	/**
 	 * Collects title candidates from the {@code <title>} element and from {@code <h2>} headers
 	 * (or {@code <h3>} headers when the page has no {@code <h2>}).
 	 * Returns an empty list when the HTML is {@code null}.
 	 */
 	private List<String> collectTitlesAndHeaders(String pageOrigHTML)
 	{
 		List<String> pageTitles = new ArrayList<String>();
 		if (pageOrigHTML == null)
 		{
 			return pageTitles;
 		}

 		// substringBetween yields null when either tag is missing, so the title is optional
 		String pageTitle = StringUtils.substringBetween(pageOrigHTML, "<title>", "</title>");
 		if (pageTitle != null)
 		{
 			// turn the usual title separators (double space, ':', '-', '|') into sentence breaks
 			pageTitle = pageTitle.replace("  ", ". ").replace("..", ".").replace(". . .", " ")
 					.replace(": ", ". ").replace("- ", ". ").replace(" |", ". ")
 					.replace(". .", ".").trim();
 			pageTitles.addAll(TextProcessor.splitToSentences(pageTitle));
 			// String.split takes a regex: the dot must be escaped, an unescaped "." matches every
 			// character and always produces an empty array
 			for (String fragment : pageTitle.split("\\."))
 			{
 				String trimmed = fragment.trim();
 				if (trimmed.length() > 0)
 				{
 					pageTitles.add(trimmed);
 				}
 			}
 		}

 		String[] headerSections = pageOrigHTML.split("<h2");
 		if (headerSections.length < 2)
 		{
 			headerSections = pageOrigHTML.split("<h3");
 		}
 		for (String section : headerSections)
 		{
 			// text between the first '>' and the following '<' of each section
 			String header = StringUtils.substringBetween(section, ">", "<");
 			if (header != null && header.length() > 20)
 			{
 				pageTitles.add(header);
 			}
 		}
 		return pageTitles;
 	}

 	/**
 	 * Splits the page text into chunks on runs of five spaces (and on '&' / '#'), normalizes
 	 * separators inside each chunk and returns at most {@code maxChunks} of the longest chunks,
 	 * ordered from shortest to longest.
 	 */
 	private String[] selectLongestChunks(String downloadedPage, int maxChunks)
 	{
 		String separated = downloadedPage.replace("     ", "&").replaceAll("(?:&)+", "#");

 		List<TextChunk> chunks = new ArrayList<TextChunk>();
 		for (String s : separated.split("#"))
 		{
 			s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")
 					.replace(": ", ". ").replace("- ", ". ")
 					.replace(". .", ".").trim();
 			chunks.add(new TextChunk(s, s.length()));
 		}

 		Collections.sort(chunks, new TextChunkComparable());

 		// clamp the start index: a page may yield fewer chunks than maxChunks
 		int first = Math.max(0, chunks.size() - maxChunks);
 		String[] longest = new String[chunks.size() - first];
 		for (int i = first; i < chunks.size(); i++)
 		{
 			longest[i - first] = chunks.get(i).text;
 		}
 		return longest;
 	}

 	/**
 	 * Splits every input chunk into sentences and keeps only those that pass all filters.
 	 * A sentence is dropped when it has more than three '.'-separated parts, contains '|',
 	 * is shorter than the length thresholds, or is rejected by
 	 * {@code GeneratedSentenceProcessor.acceptableMinedSentence}.
 	 *
 	 * @param longestSents chunks to clean; {@code null} entries are skipped
 	 * @return the accepted sentences, never {@code null}
 	 */
 	protected String[] cleanListOfSents(String[] longestSents)
 	{
 		List<String> sentsClean = new ArrayList<String>();
 		if (longestSents == null)
 		{
 			return new String[0];
 		}
 		for (String sentenceOrMultSent : longestSents)
 		{
 			if (sentenceOrMultSent == null)
 			{
 				continue;
 			}
 			List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
 			if (furtherSplit == null)
 			{
 				continue;
 			}
 			for (String s : furtherSplit)
 			{
 				// the null check has to come before any method is called on s
 				if (s == null)
 					continue;
 				if (s.replace('.', '&').split("&").length > 3)
 					continue;
 				if (s.indexOf('|') > -1)
 					continue;
 				if (s.trim().length() < sentThresholdLength || s.length() < sentThresholdLength + 10)
 					continue;
 				if (GeneratedSentenceProcessor.acceptableMinedSentence(s) == null)
 				{
 					System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+s);
 					continue;
 				}
 				sentsClean.add(s);
 			}
 		}
 		return sentsClean.toArray(new String[0]);
 	}

 	/** A piece of page text together with its length, used for sorting chunks by size. */
 	public class TextChunk {
 		public TextChunk(String s, int length) {
 			this.text = s;
 			this.len = length;
 		}
 		public String text;
 		public int len;
 	}

 	/** Orders {@link TextChunk}s by ascending {@code len}. */
 	public class TextChunkComparable implements Comparator<TextChunk>
 	{
 		public int compare(TextChunk ch1, TextChunk ch2)
 		{
 			if (ch1.len > ch2.len)
 				return 1;
 			else if (ch1.len < ch2.len)
 				return -1;
 			else
 				return 0;
 		}
 	}

 	/** Manual smoke run: prints the most frequent words found on a sample page. */
 	public static void main(String[] args){
 		WebPageExtractor extractor = new WebPageExtractor();
 		List<String[]> res =
 				extractor.extractSentencesWithPotentialProductKeywords("http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");
 		if (res == null)
 		{
 			System.out.println("No usable content could be fetched from the page");
 			return;
 		}
 		// Arrays.toString prints the words; printing the array itself only shows its identity
 		System.out.println(Arrays.toString(res.get(1)));
 	}

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.tools.parse_thicket.apps;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.List;

	import org.apache.commons.lang.StringUtils;

	import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
	import opennlp.tools.similarity.apps.utils.PageFetcher;
	import opennlp.tools.textsimilarity.TextProcessor;
	import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

	public class WebPageExtractor
	{
	protected PageFetcher pageFetcher = new PageFetcher();

	protected ParserChunker2MatcherProcessor nlProc;
	protected MostFrequentWordsFromPageGetter mostFrequentWordsFromPageGetter = new MostFrequentWordsFromPageGetter();

	protected static int sentThresholdLength = 70;

	public List<String[]> extractSentencesWithPotentialProductKeywords(String url)
	{
	int maxSentsFromPage= 20;
	List<String[]> results = new ArrayList<String[]>();

	String downloadedPage = pageFetcher.fetchPage(url, 20000);
	if (downloadedPage == null \|\| downloadedPage.length() < 100)
	{
	return null;
	}

	String pageOrigHTML = pageFetcher.fetchOrigHTML(url);
	String pageTitle = StringUtils.substringBetween(pageOrigHTML, "<title>", "</title>" );
	pageTitle = pageTitle.replace(" ", ". ").replace("..", ".").replace(". . .", " ")
	.replace(": ", ". ").replace("- ", ". ").replace(" \|", ". ").
	replace (". .",".").trim();
	List<String> pageTitles = new ArrayList<String>();
	pageTitles.addAll(TextProcessor.splitToSentences(pageTitle));
	pageTitles.addAll(Arrays.asList(pageTitle.split(".")));

	String[] headerSections = pageOrigHTML.split("<h2");
	if (headerSections.length<2)
	headerSections = pageOrigHTML.split("<h3");
	for(String section: headerSections){

	String header = StringUtils.substringBetween(section, ">", "<");
	if (header!=null && header.length()>20)
	pageTitles.add(header);
	}


	downloadedPage= downloadedPage.replace(" ", "&");
	downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
	String[] sents = downloadedPage.split("#");
	List<TextChunk> sentsList = new ArrayList<TextChunk>();
	for(String s: sents){
	s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ")
	.replace(": ", ". ").replace("- ", ". ").
	replace (". .",".").trim();
	sentsList.add(new TextChunk(s, s.length()));
	}

	Collections.sort(sentsList, new TextChunkComparable());


	String[] longestSents = new String[maxSentsFromPage];
	int j=0;
	for(int i=sentsList.size() -maxSentsFromPage; i< sentsList.size(); i++){
	longestSents[j] = sentsList.get(i).text;
	j++;
	}

	sents = cleanListOfSents(longestSents);

	List<String> mosFrequentWordsListFromPage = mostFrequentWordsFromPageGetter. getMostFrequentWordsInTextArr(sents);
	// mostFrequentWordsFromPageGetter. getMostFrequentWordsInText(downloadedPage);

	results.add(pageTitles.toArray(new String[0]));
	results.add(mosFrequentWordsListFromPage.toArray(new String[0]));
	results.add(sents);

	return results;
	}

	protected String[] cleanListOfSents(String[] longestSents)
	{
	List<String> sentsClean = new ArrayList<String>();
	for (String sentenceOrMultSent : longestSents)
	{
	List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent);
	for(String s : furtherSplit){
	if (s.replace('.','&').split("&").length>3)
	continue;
	if (s.indexOf('\|')>-1)
	continue;
	if (s == null \|\| s.trim().length() < sentThresholdLength \|\| s.length() < sentThresholdLength + 10)
	continue;
	if (GeneratedSentenceProcessor.acceptableMinedSentence(s)==null){
	System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+s);
	continue;
	}
	sentsClean.add(s);
	}
	}
	return (String[]) sentsClean.toArray(new String[0]);
	}

	public class TextChunk {
	public TextChunk(String s, int length) {
	this.text = s;
	this.len = length;
	}
	public String text;
	public int len;
	}

	public class TextChunkComparable implements Comparator<TextChunk>
	{
	public int compare(TextChunk ch1, TextChunk ch2)
	{
	if (ch1.len>ch2.len)
	return 1;
	else if (ch1.len<ch2.len)
	return -1;
	else return 0;

	}
	}

	public static void main(String[] args){
	WebPageExtractor extractor = new WebPageExtractor();
	List<String[]> res =
	extractor.extractSentencesWithPotentialProductKeywords("http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/");
	System.out.println(res.get(1));

	}

	}