blob: 038fcfc2f77d5c5dae381203d930e15a3ad211f3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.apps;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import opennlp.tools.similarity.apps.GeneratedSentenceProcessor;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.TextProcessor;
import org.apache.commons.lang.StringUtils;
public class WebPageContentSentenceExtractor extends WebPageExtractor {
public List<String> extractSentencesWithPotentialReviewPhrases(String url)
{
int maxSentsFromPage = 30;
String downloadedPage = pageFetcher.fetchPage(url, 20000);
if (downloadedPage == null || downloadedPage.length() < 100)
{
return null;
}
downloadedPage= downloadedPage.replace(" ", "&");
downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
String[] sents = downloadedPage.split("#");
List<TextChunk> sentsList = new ArrayList<TextChunk>();
for(String s: sents){
s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ")
.replace(": ", ". ").replace("- ", ". ").
replace (". .",".").trim();
sentsList.add(new TextChunk(s, s.length()));
}
Collections.sort(sentsList, new TextChunkComparable());
String[] longestSents = new String[maxSentsFromPage];
int j=0; // -1 removed
for(int i=sentsList.size()-1 -maxSentsFromPage; i< sentsList.size()-1; i++){
longestSents[j] = sentsList.get(i).text;
j++;
}
sents = cleanListOfSents(longestSents);
/*
for(int i = 0; i< sents.length; i++){
sents[i] = sents[i].trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ")
.replace(": ", ". ").replace("- ", ". ").
replace (". .",".").trim();
}
sents = cleanListOfSents(sents);
*/ sents = verifyEnforceStartsUpperCase(sents);
return Arrays.asList(sents);
}
private String[] verifyEnforceStartsUpperCase(String[] sents) {
for(int i=0; i<sents.length; i++){
String s = sents[i];
s = StringUtils.trim(s);
String sFirstChar = s.substring(0, 1);
if (!sFirstChar.toUpperCase().equals(sFirstChar)){
s = sFirstChar.toUpperCase()+s.substring(1);
}
sents[i] = s;
}
return sents;
}
private List<String> cleanProductFeatures(List<String> productFeaturesList) {
List<String> results = new ArrayList<String>();
for(String feature: productFeaturesList){
if (feature.startsWith("Unlimited Free") || feature.startsWith("View Larger") || feature.startsWith("View Larger") || feature.indexOf("shipping")>0)
continue;
results.add(feature);
}
return results;
}
// extracts paragraphs from web page
protected String[] cleanListOfSents(String[] longestSents)
{
float minFragmentLength = 40, minFragmentLengthSpace=4;
List<String> sentsClean = new ArrayList<String>();
for (String sentenceOrMultSent : longestSents)
{
if (GeneratedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){
System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent);
continue;
}
// aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n.
int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length;
float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
if ( avgSentenceLengthInTextPortion<minFragmentLength)
continue;
// o oo o ooo o o o ooo oo ooo o o oo
numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length;
avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots;
if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace)
continue;
sentsClean.add(sentenceOrMultSent);
}
return (String[]) sentsClean.toArray(new String[0]);
}
private String startWithCapitalSent(String sent) {
String firstChar = sent.substring(0,1);
String remainder = sent.substring(1);
return firstChar.toUpperCase()+remainder;
}
public HitBase formTextFromOriginalPageGivenSnippet(HitBase hit) {
List<String> results = extractSentencesWithPotentialReviewPhrases(hit.getUrl());
hit.setOriginalSentences(results);
return hit;
}
}