| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.apps.review_builder; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| |
| import opennlp.tools.jsmlearning.ProfileReaderWriter; |
| import opennlp.tools.parse_thicket.apps.WebPageExtractor; |
| import opennlp.tools.similarity.apps.HitBase; |
| import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; |
| import opennlp.tools.similarity.apps.utils.Utils; |
| import opennlp.tools.textsimilarity.TextProcessor; |
| import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; |
| |
| import org.apache.commons.lang.StringUtils; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| public class WebPageReviewExtractor extends WebPageExtractor { |
| |
| private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| |
| private final BingAPIProductSearchManager prodman = new BingAPIProductSearchManager(); |
| private final SentenceOriginalizer orig; |
| |
| public WebPageReviewExtractor(String resourceDir) { |
| orig = new SentenceOriginalizer(resourceDir); |
| } |
| |
| public String[] removeDuplicates(String[] hits) { |
| StringDistanceMeasurer meas = new StringDistanceMeasurer(); |
| |
| List<Integer> idsToRemove = new ArrayList<>(); |
| List<String> hitsDedup = new ArrayList<>(); |
| try { |
| for (int i = 0; i < hits.length; i++) |
| for (int j = i + 1; j < hits.length; j++) { |
| String title1 = hits[i]; |
| String title2 = hits[j]; |
| if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2)) |
| continue; |
| if (meas.measureStringDistance(title1, title2) > 0.7) { |
| idsToRemove.add(j); // dupes found, later list member to be deleted |
| } |
| } |
| for (int i = 0; i < hits.length; i++) |
| if (!idsToRemove.contains(i)) |
| hitsDedup.add(hits[i]); |
| if (hitsDedup.size() < hits.length) { |
| System.out.println("Removed duplicates from relevant search results, including " |
| + hits[idsToRemove.get(0)]); |
| } |
| } |
| catch (Exception e) { |
| System.out.println("Problem removing duplicates from relevant images"); |
| } |
| |
| return hitsDedup.toArray(new String[0]); |
| |
| } |
| |
| public ReviewObj extractSentencesWithPotentialReviewPhrases(String url) { |
| ReviewObj reviewObj = new ReviewObj(); |
| int maxSentsFromPage= 20; |
| |
| String downloadedPage = pageFetcher.fetchPage(url, 20000); |
| if (downloadedPage == null || downloadedPage.length() < 100) |
| { |
| return null; |
| } |
| |
| String pageOrigHTML = pageFetcher.fetchOrigHTML(url); |
| |
| List<String> productFeaturesList = new ArrayList<>(); |
| String[] productFeatures = StringUtils.substringsBetween(pageOrigHTML, "<li>", "</li>" ); |
| if (productFeatures!=null){ |
| for(String item: productFeatures ){ |
| if (item.contains("class") || item.contains("www.") || item.contains("href")) |
| continue; |
| item = item.replace("<span>","").replace("</span>","").replace("<b>","").replace("</b>",""); |
| if (item.length()>80 && MinedSentenceProcessor.acceptableMinedSentence(item)==null){ |
| LOG.debug("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = {}", item); |
| continue; |
| } |
| productFeaturesList .add(item); |
| } |
| } |
| |
| productFeaturesList = cleanProductFeatures(productFeaturesList); |
| |
| String startArea = StringUtils.substringBetween(pageOrigHTML, "reviewHistoPop", "t of 5 stars"); |
| String item = StringUtils.substringBetween(startArea, "title=\"","ou" ); |
| if (item==null) { //title="4.0 out of 5 stars" ><span>4.0 out of 5 stars</span> |
| int index = pageOrigHTML.indexOf("of 5 stars\""); |
| startArea = StringUtils.substringBetween(pageOrigHTML, "of 5 stars\"", "of 5 stars"); |
| item = StringUtils.substringBetween(startArea, "<span>","ou" ); |
| } |
| |
| // if found, process |
| if (item!=null){ |
| try { |
| float rating = Float.parseFloat(item); |
| reviewObj.setRating(rating); |
| } catch (NumberFormatException e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| } |
| //productFeaturesList .add(item); |
| |
| downloadedPage= downloadedPage.replace(" ", "&"); |
| downloadedPage = downloadedPage.replaceAll("(?:&)+", "#"); |
| String[] sents = downloadedPage.split("#"); |
| List<TextChunk> sentsList = new ArrayList<>(); |
| for(String s: sents) { |
| s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ") |
| .replace(": ", ". ").replace("- ", ". "). |
| replace (". .",".").trim(); |
| sentsList.add(new TextChunk(s, s.length())); |
| } |
| |
| sentsList.sort(new TextChunkComparable()); |
| String[] longestSents = new String[maxSentsFromPage]; |
| int j=0; // -1 removed |
| for (int i=sentsList.size()-1 -maxSentsFromPage; i< sentsList.size()&& j<longestSents.length; i++) { |
| longestSents[j] = sentsList.get(i).text; |
| j++; |
| } |
| |
| sents = cleanListOfSents(longestSents); |
| sents = removeDuplicates(sents); |
| sents = verifyEnforceStartsUpperCase(sents); |
| |
| reviewObj.setFeaturePhrases(productFeaturesList.toArray(new String[0])); |
| reviewObj.setOrigSentences(sents); |
| |
| return reviewObj; |
| } |
| |
| private String[] verifyEnforceStartsUpperCase(String[] sents) { |
| for (int i=0; i<sents.length; i++) { |
| String s = sents[i]; |
| s = StringUtils.trim(s); |
| String sFirstChar = s.substring(0, 1); |
| if (!sFirstChar.toUpperCase().equals(sFirstChar)){ |
| s = sFirstChar.toUpperCase()+s.substring(1); |
| } |
| sents[i] = s; |
| } |
| return sents; |
| } |
| |
| private List<String> cleanProductFeatures(List<String> productFeaturesList) { |
| List<String> results = new ArrayList<>(); |
| for (String feature: productFeaturesList) { |
| if (feature.startsWith("Unlimited Free") || feature.startsWith("View Larger") || feature.startsWith("View Larger") || feature.indexOf("shipping")>0) |
| continue; |
| results.add(feature); |
| } |
| return results; |
| } |
| |
| protected String[] cleanListOfSents(String[] longestSents) { |
| float minFragmentLength = 40, minFragmentLengthSpace=4; |
| |
| List<String> sentsClean = new ArrayList<>(); |
| for (String sentenceOrMultSent : longestSents) { |
| if (MinedSentenceProcessor.acceptableMinedSentence(sentenceOrMultSent)==null){ |
| // System.out.println("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = "+sentenceOrMultSent); |
| continue; |
| } |
| // aaa. hhh hhh. kkk . kkk ll hhh. lll kkk n. |
| int numOfDots = sentenceOrMultSent.replace('.','&').split("&").length; |
| float avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots; |
| if ( avgSentenceLengthInTextPortion<minFragmentLength) |
| continue; |
| // o oo o ooo o o o ooo oo ooo o o oo |
| numOfDots = sentenceOrMultSent.replace(' ','&').split("&").length; |
| avgSentenceLengthInTextPortion = (float)sentenceOrMultSent.length() /(float) numOfDots; |
| if ( avgSentenceLengthInTextPortion<minFragmentLengthSpace) |
| continue; |
| |
| List<String> furtherSplit = TextProcessor.splitToSentences(sentenceOrMultSent); |
| |
| // forced split by ',' somewhere in the middle of sentence |
| // disused - Feb 26 13 |
| //furtherSplit = furtherMakeSentencesShorter(furtherSplit); |
| furtherSplit.remove(furtherSplit.size()-1); |
| for (String s : furtherSplit) { |
| if (s.indexOf('|')>-1) |
| continue; |
| s = s.replace("<em>"," ").replace("</em>"," "); |
| s = Utils.convertToASCII(s); |
| sentsClean.add(s); |
| } |
| } |
| |
| return sentsClean.toArray(new String[0]); |
| } |
| |
| private List<String> furtherMakeSentencesShorter(List<String> furtherSplit) { |
| int MIN_LENGTH_TO_SPLIT = 80; |
| List<String> results = new ArrayList<>(); |
| for(String sent: furtherSplit) { |
| sent = startWithCapitalSent(sent); |
| int len = sent.length(); |
| if (len <MIN_LENGTH_TO_SPLIT) |
| results.add(sent); |
| else { |
| try { |
| int commaIndex = StringUtils.indexOf(sent, ','); |
| int lastCommaIndex = StringUtils.lastIndexOf(sent, ','); |
| int splitIndex; |
| if (Math.abs(commaIndex- len/2) > Math.abs(lastCommaIndex- len/2)) |
| splitIndex = commaIndex; |
| else |
| splitIndex = lastCommaIndex; |
| if (splitIndex<0) |
| results.add(sent); |
| else { |
| String sent1 = sent.substring(0, splitIndex)+". "; |
| String sent2 = startWithCapitalSent(sent.substring(splitIndex+1)); |
| results.add(sent1); results.add(sent2); |
| } |
| } catch (Exception e) { |
| results.add(sent); |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| |
| } |
| } |
| return results; |
| } |
| |
| private String startWithCapitalSent(String sent) { |
| String firstChar = sent.substring(0,1); |
| String remainder = sent.substring(1); |
| |
| return firstChar.toUpperCase()+remainder; |
| } |
| |
| public List<String> formReviewsForAProduct(String name /*long bpid, String keywordsName*/){ |
| ReviewObj reviewObjTotal = null; |
| try { |
| List<HitBase> pagesForAProduct = prodman.findProductByName(name, 1); |
| reviewObjTotal = null; |
| |
| for(HitBase p: pagesForAProduct){ |
| ReviewObj reviewObj = |
| extractSentencesWithPotentialReviewPhrases(p.getUrl()); |
| // init with first element |
| if (reviewObjTotal == null) |
| reviewObjTotal = reviewObj; |
| if (reviewObj==null) |
| continue; |
| String[] afterOriginalization = orig.convert(reviewObj.getOrigSentences(), p.getTitle(), reviewObj.getKeywordsName()); |
| reviewObj.setOriginalizedSentences(Arrays.asList(afterOriginalization)); |
| reviewObj.setSentimentPhrases(orig.formedPhrases); |
| |
| List<String> buf = reviewObjTotal.getSentimentPhrases(); |
| if (orig.formedPhrases!=null && orig.formedPhrases.size()>0){ |
| buf.addAll(orig.formedPhrases); |
| reviewObjTotal.setSentimentPhrases(buf); |
| } |
| |
| /* |
| buf = reviewObjTotal.getOriginalizedSentences(); |
| if (buf!=null && afterOriginalization!=null && afterOriginalization.length>0){ |
| List<String> b1 = Arrays.asList(afterOriginalization); |
| List<String> b2 = new ArrayList<String>(); |
| b2.addAll(buf); |
| b2.addAll(new ArrayList<String>(b1)); |
| reviewObjTotal.setOriginalizedSentences(b2); |
| } |
| */ |
| } |
| if (reviewObjTotal==null) return new ArrayList<>(); |
| |
| List<String> textReviews = buildManyReviewTexts(reviewObjTotal); |
| |
| /* |
| String textReview = buildText(reviewObjTotal); |
| try { |
| if (textReview!=null && textReview.length()>60) |
| ser.saveReviewsToDB(textReview, bpid, pagesForAProduct.get(0).getUrl(), pagesForAProduct.get(0).getTitle(), |
| reviewObjTotal.getSentimentPhrases().toString(), reviewObjTotal.getRating()); |
| } catch (Exception e) { |
| System.out.println("Database write failed"); |
| } |
| */ |
| |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| return reviewObjTotal.getOriginalizedSentences(); |
| } |
| |
| private String buildText(ReviewObj reviewObj) { |
| |
| String[] features = reviewObj.getFeaturePhrases(); |
| List<String> sentences =reviewObj.getOriginalizedSentences(); |
| StringBuilder buf = new StringBuilder(); |
| int count = 0; |
| for(String sent:sentences){ |
| if (sent!=null) |
| buf.append(sent).append(" "); |
| if (count%2==0 && count<features.length) |
| if (features[count]!=null){ |
| buf.append(features[count]); |
| if (!(features[count].endsWith("!") ||features[count].endsWith("?")||features[count].endsWith("?") |
| ||features[count].endsWith(".\"") )) |
| buf.append(". "); |
| } |
| |
| if (count%5==0) |
| buf.append("\n"); |
| count++; |
| } |
| return buf.toString(); |
| } |
| |
| private List<String> buildManyReviewTexts(ReviewObj reviewObj) { |
| |
| String[] features = reviewObj.getFeaturePhrases(); |
| List<String> sentences =reviewObj.getOriginalizedSentences(); |
| |
| // first count how many sentences |
| int NUM_SENTS_IN_REVIEW = 7; |
| int count=0; |
| for(String sent:sentences){ |
| if (sent!=null) |
| count++; |
| } |
| int nReviews = count/NUM_SENTS_IN_REVIEW; |
| if (nReviews<1) |
| nReviews=1; |
| StringBuffer[] bufs = new StringBuffer[nReviews]; |
| for(int i=0; i<bufs.length; i++){ |
| bufs[i] = new StringBuffer(); |
| } |
| |
| count = 0; |
| int currentRevIndex = 0; |
| for(String sent:sentences){ |
| if (sent!=null) |
| bufs[currentRevIndex].append(sent).append(" "); |
| if (count%2==0 && count<features.length) |
| if (features[count]!=null){ |
| bufs[currentRevIndex].append(features[count]); |
| if (!(features[count].endsWith("!") ||features[count].endsWith("?")||features[count].endsWith("?") |
| ||features[count].endsWith(".\"") )) |
| bufs[currentRevIndex].append(". "); |
| } |
| |
| try { |
| if (bufs[currentRevIndex].toString().split(".").length>4) |
| bufs[currentRevIndex].append("\n"); |
| } catch (Exception e) { |
| LOG.error(e.getLocalizedMessage(), e); |
| } |
| |
| count++; |
| currentRevIndex++; |
| if (currentRevIndex>=nReviews) |
| currentRevIndex=0; |
| } |
| |
| List<String> results = new ArrayList<>(); |
| for(StringBuffer b:bufs){ |
| String sent = b.toString().replace("!.","!").replace("?.","?"); |
| results.add(sent); |
| } |
| return results; |
| } |
| |
| public static void main(String[] args){ |
| String resourceDir = "C:/stanford-corenlp/src/test/resources/"; |
| ParserChunker2MatcherProcessor proc = ParserChunker2MatcherProcessor.getInstance(resourceDir); |
| |
| //ProductFinderInAWebPage init = new ProductFinderInAWebPage("C:/workspace/relevanceEngine/src/test/resources"); |
| |
| WebPageReviewExtractor extractor = new WebPageReviewExtractor(resourceDir); |
| String[] res1 = extractor.verifyEnforceStartsUpperCase(new String[]{ "hhhh !", "Klyn mng hghj ."}); |
| |
| List<String> res = extractor.formReviewsForAProduct(//"McCulloch 16-Inch 3.5 HP Electric Chain Saw"); |
| // "WORX Electric JawSaw with Extension Handle"); |
| // "Panasonic 2-Line Digital Cordless System", 215200345l); |
| // "Sport Silver Dial Women", 215475290); |
| //"Rectangle Area Rug", 213885290); |
| // "40VA Replacement Transformer", 213085391); |
| // "PSYLLIUM POWDER Food", 213185391); |
| // "Leighton Toilet Tank", 213285391); |
| //"Samsung Knack U310 Flip Phone", 214495493); |
| //"Panasonic Cordless Phone 2 handsets", 214870820); |
| //"Winegard TV Antenna Pre-Amplifier", 211924499); |
| //"Atlona AT-HD-V18 HDMI Distribution Amplifier", 215162612); |
| //"airport express base station", 211462827); |
| //"denon Network Streaming A/V Home Theater receiver", 209565926); |
| //"sherwood receiver 400 watts stereo", 211286714); |
| //"multizone music distribution system", 205333526); |
| //"niles zr4", 215104912); |
| //"alpine waterproof marine cd receiver", 215167695); |
| //"sherwood channel receiver dolby", 215116818); |
| //"sherwood lcd tv widescreen hdtv", 215481917); |
| //"multiroom music distribution system", 205333526); |
| // "fusion ms compact stereo", 215649463); |
| //"pyle pro speaker", 213265125); |
| // "apple iphone 4g", 213265325); |
| //"sherwood high performance receiver", 215394729); |
| //"sony camera housing", 211960592); |
| //"sony xl2100", 1135329203); |
| //"sony 18 megapixel-digital-camera", 215743208); |
| //"sony m470 microcassette tape recorder", 205828052); |
| //"sony monitor terminal expansion board", 213244217); |
| //"sony cybershot digital-camera", 215743207); |
| //"sony interchangeable lens handycam camcorder", 215153503); |
| //"canon powershot digital camera", 214754207); |
| //"brother ink pageyield yellow", 204743189); |
| // ?? "garmin 2200 gps navigator", 215167480); |
| "halo portable backup battery"); |
| |
| ProfileReaderWriter.writeReportListStr(res, "formedReviewSentences4.csv"); |
| |
| |
| /* |
| res= extractor. extractSentencesWithPotentialReviewPhrases(//"http://www.sitbetter.com/view/chair/ofm-500-l/ofm--high-back-leather-office-chair/"); |
| //"http://www.amazon.com/OFM-High-Back-Leather-Integral-Headrest/dp/B002SIW1E0/ref=sr_1_1?ie=UTF8&qid=1353370254&sr=8-1&keywords=OFM-High-Back-Leather-Integral-Headrest"); |
| //"http://www.amazon.com/Oregon-511AX-Chain-Grinder-Sharpener/dp/B0000AX0CY/ref=sr_1_4?s=industrial&ie=UTF8&qid=1353373435&sr=1-4&keywords=chain+saws"); |
| // "http://www.amazon.com/Bearing-UCP204-12-Housing-Mounted-Bearings/dp/B002BBIYWM/ref=sr_1_1?s=industrial&ie=UTF8&qid=1353373786&sr=1-1&keywords=pillow+block+bearing"); |
| "http://www.amazon.com/ShelterLogic-20--Feet-Auto-Shelter/dp/B001OFNK8O/ref=sr_1_1?s=lawn-garden&ie=UTF8&qid=1353376677&sr=1-1&keywords=shelterlogic+62680+autoshelter+portable+garage+carport"); |
| System.out.println(res); |
| */ |
| |
| } |
| } |