// blob: 4f0512bbec80609c3f56bd88b9180e18cabe8e8c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.apps;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import opennlp.tools.similarity.apps.utils.Utils;
import org.apache.commons.lang.StringUtils;
/**
 * Heuristic filters and clean-up routines for sentences mined from page text.
 *
 * <p>All methods are static and keep no state. Rejections from
 * {@link #acceptableMinedSentence(String)} are signalled by a {@code null}
 * return value.
 */
public class MinedSentenceProcessor {

  /**
   * Upper bound on (delimiter-separated fragments) / (space-separated tokens).
   * Kept as a double so the float ratio is widened before comparison.
   */
  private static final double MAX_DELIMITER_RATIO = 0.7;

  /** Delimiters, other than the comma, that are checked against the ratio. */
  private static final char[] OTHER_DELIMITERS = {'/', '.', '!', '='};

  /**
   * Symbols counted as evidence that the text comes from the wrong area of a
   * page. NOTE(review): U+FFFD is the Unicode replacement character; it is
   * probably an encoding casualty of some other symbol - confirm.
   */
  private static final char[] WRONG_AREA_SYMBOLS =
      {'>', '\uFFFD', '|', ':', '/', '-', '%'};

  /** Sentences shorter than this are subject to the wrong-area symbol check. */
  private static final int SHORT_SENTENCE_LENGTH = 200;

  /** Number of wrong-area symbols at which a short sentence is rejected. */
  private static final int MAX_WRONG_AREA_SYMBOLS = 2;

  /**
   * Lower-case fragments; a sentence containing any of them is rejected.
   * Every entry must be lower case because it is matched against the
   * lower-cased sentence.
   */
  private static final String[] REJECTED_SUBSTRINGS = {
    "click here", " wikip", "copyright", "operating hours", "days per week",
    "click for", "photos", "find the latest", "terms of service",
    "clicking here", "skip to", "sidebar", "tags:", "available online",
    "get online", "buy online", "not valid", "discount", "official site",
    "this video", "this book", "this product", "paperback", "hardcover",
    "audio cd", "related searches", "permission is granted", "[edit",
    "edit categories", "free license", "under the terms", "rights reserved",
    "wikipedia", "recipient of", "this message", "mailing list",
    "purchase order", "mon-fri", "email us", "privacy pol", "back to top",
    "for details", "assistance?", "chat live", "free shipping",
    "company info", "satisfaction g", "contact us", "conditions",
    "the recipient", "day return", "days return", "refund it", "your money",
    "purchase orders", "return it", "credit card", "storeshop",
    "for a limited time", "prime members", "amazon members", "unlimited free",
    "shipping",
    // not a script text
    "document.body", " var ", "search suggestions"
  };

  /** Lower-case prefixes; a sentence starting with any of them is rejected. */
  private static final String[] REJECTED_PREFIXES = {
    "subscribe", "posted by", "below", "fax", "write", "email", "we ", "free",
    "exchange it ", "find", "shop", "unlimited", "amazon", "search"
  };

  /** Lower-case suffixes; a sentence ending with any of them is rejected. */
  private static final String[] REJECTED_SUFFIXES = {"the", "the."};

  /** Trailing fragments chomped, in this order, by processSentence. */
  private static final String[] TRAILING_JUNK =
      {"..", ". .", " .", ".", "...", " ...."};

  /** Abbreviated month (with its period) paired with the full month name. */
  private static final String[][] MONTH_ABBREVIATIONS = {
    {"Jan.", "January"}, {"Feb.", "February"}, {"Mar.", "March"},
    {"Apr.", "April"}, {"Jun.", "June"}, {"Jul.", "July"},
    {"Aug.", "August"}, {"Sep.", "September"}, {"Oct.", "October"},
    {"Nov.", "November"}, {"Dec.", "December"}
  };

  /**
   * Decides whether a mined sentence looks like genuine running text and, if
   * so, returns a cleaned-up copy of it.
   *
   * <p>A sentence is rejected when it has too many delimiter-separated
   * fragments relative to its space-separated tokens, more than two
   * fragments around {@code |} or {@code >}, any blacklisted phrase, prefix
   * or suffix (matched case-insensitively), or - if shorter than 200
   * characters - two or more wrong-area symbols. Delimiter rejections are
   * reported on standard output; the others are silent.
   *
   * @param sent the candidate sentence; {@code null} is treated as rejected
   * @return the cleaned sentence, or {@code null} if it was rejected
   */
  public static String acceptableMinedSentence(String sent) {
    if (sent == null) {
      return null;
    }

    // if too many commas => seo text
    int tokenCount = StringUtils.split(sent, ' ').length;
    if (exceedsDelimiterRatio(sent, ',', tokenCount)) {
      System.out.println("Rejection: too many commas");
      return null;
    }
    for (char delimiter : OTHER_DELIMITERS) {
      if (exceedsDelimiterRatio(sent, delimiter, tokenCount)) {
        System.out.println("Rejection: too many delimiters");
        return null;
      }
    }
    if (StringUtils.split(sent, '|').length > 2
        || StringUtils.split(sent, '>').length > 2) {
      System.out.println("Rejection: too many |s or >s ");
      return null;
    }

    // Locale.ROOT keeps the matching below independent of the JVM locale.
    String sentTry = sent.toLowerCase(Locale.ROOT);

    // NOTE(review): the original had a "too many long spaces" check here, but
    // it tested (length without spaces) - (full length) > 10, which can never
    // be positive, so it never rejected anything. It is omitted rather than
    // guessed at; reinstate it once the intended space-run length is known.

    // Example of a rejected sentence: "Millions of Amazon Prime members enjoy
    // instant videos, free Kindle books and unlimited free two-day shipping."
    if (isBoilerplate(sentTry)) {
      return null;
    }

    // count symbols indicating wrong parts of page to mine for text
    // if short and contains too many symbols indicating wrong area: reject
    if (sentTry.length() < SHORT_SENTENCE_LENGTH
        && countWrongAreaSymbols(sentTry) >= MAX_WRONG_AREA_SYMBOLS) {
      return null;
    }

    // NOTE(review): the numbering removal ("1." .. "4.") and the year
    // rewriting (2006-2009 -> 2011) are preserved from the original; their
    // purpose is not evident from this file.
    sent = sent.replace('[', ' ').replace(']', ' ')
        .replace("_should_find_orig_", "").replace(". .", ". ")
        .replace("amp;", " ").replace("1.", " ").replace("2.", " ")
        .replace("3.", " ").replace("4.", " ").replace("2009", "2011")
        .replace("2008", "2011").replace("2006", "2011")
        .replace("2007", "2011").replace("VIDEO:", " ").replace("Video:", " ")
        .replace("no comments", " ")
        // collapse runs of spaces (the original replaced " " with " ", a no-op)
        .replaceAll(" {2,}", " ")
        .replace("(more.)", "").replace("more.", "").replace("<more>", "")
        .replace("[more]", "").replace(".,", ".").replace("&lt;", "")
        .replace("p&gt;", "").replace("product description", "");
    // TODO .replace("a.", ".");

    // drop everything from " posted" onwards, unless it starts the sentence
    int endIndex = sent.indexOf(" posted");
    if (endIndex > 0) {
      sent = sent.substring(0, endIndex);
    }
    return sent;
  }

  /**
   * Strips markup and trailing punctuation noise from a sentence and makes
   * sure it ends with sentence-final punctuation.
   *
   * <p>The input is passed through {@code Utils.fullStripHTML} (presumably
   * removes HTML markup - defined outside this file), trailing dot patterns
   * are chomped, whitespace runs are collapsed to single spaces, and a short
   * tail after a single {@code |} is cut off when the head is more than three
   * times longer than the tail.
   *
   * @param pageSentence the raw sentence; may be {@code null}
   * @return the normalized sentence, or {@code ""} if the input was
   *         {@code null}
   */
  public static String processSentence(String pageSentence) {
    if (pageSentence == null) {
      return "";
    }
    pageSentence = Utils.fullStripHTML(pageSentence);
    for (String junk : TRAILING_JUNK) {
      pageSentence = StringUtils.chomp(pageSentence, junk);
    }
    pageSentence = pageSentence.replace("::", ":").replace(".,", ". ")
        .replace("(.)", "");
    pageSentence = pageSentence.trim();
    // make single spaces everywhere
    pageSentence = pageSentence.replaceAll("\\s+", " ");

    // remove the shorter part of the sentence at the end, after a pipe
    String[] pipes = StringUtils.split(pageSentence, '|');
    if (pipes.length == 2
        && ((float) pipes[0].length() / (float) pipes[1].length() > 3.0)) {
      // Take the first token rather than substring(0, indexOf('|') - 1): the
      // latter lost a character when no space preceded the pipe and threw
      // when the sentence started with a pipe.
      pageSentence = pipes[0].trim();
    }

    if (!StringUtils.contains(pageSentence, '.')
        && !StringUtils.contains(pageSentence, '?')
        && !StringUtils.contains(pageSentence, '!')) {
      pageSentence = pageSentence + ". ";
    }
    pageSentence = pageSentence.replace(" .", ".").replace("..", ".").trim();
    // '?' and '!' already terminate a sentence; do not turn "Why?" into "Why?. "
    if (!pageSentence.endsWith(".") && !pageSentence.endsWith("?")
        && !pageSentence.endsWith("!")) {
      pageSentence += ". ";
    }
    return pageSentence;
  }

  /**
   * Expands abbreviated month names ("Jan.", "Feb.", ...) to their full form
   * so that the abbreviation's period is no longer present in the text.
   *
   * @param pageContent the text to normalize; may be {@code null}
   * @return the text with month abbreviations expanded, or {@code null} if
   *         the input was {@code null}
   */
  public static String normalizeForSentenceSplitting(String pageContent) {
    if (pageContent == null) {
      return null;
    }
    // String is immutable: the result of replace() must be kept. The original
    // discarded it and returned the input unchanged.
    String normalized = pageContent;
    for (String[] month : MONTH_ABBREVIATIONS) {
      normalized = normalized.replace(month[0], month[1]);
    }
    return normalized;
  }

  /**
   * Tells whether splitting on the delimiter yields too many fragments
   * relative to the number of space-separated tokens.
   */
  private static boolean exceedsDelimiterRatio(String sent, char delimiter,
      int tokenCount) {
    int fragmentCount = StringUtils.split(sent, delimiter).length;
    // float division on purpose: a zero token count gives NaN or Infinity
    // instead of throwing
    return (float) fragmentCount / (float) tokenCount > MAX_DELIMITER_RATIO;
  }

  /** Tells whether the lower-cased sentence matches any blacklist entry. */
  private static boolean isBoilerplate(String lowerCased) {
    for (String fragment : REJECTED_SUBSTRINGS) {
      if (lowerCased.indexOf(fragment) > -1) {
        return true;
      }
    }
    for (String prefix : REJECTED_PREFIXES) {
      if (lowerCased.startsWith(prefix)) {
        return true;
      }
    }
    for (String suffix : REJECTED_SUFFIXES) {
      if (lowerCased.endsWith(suffix)) {
        return true;
      }
    }
    return false;
  }

  /** Counts the characters of the text that are wrong-area symbols. */
  private static int countWrongAreaSymbols(String text) {
    int count = 0;
    for (int i = 0; i < text.length(); i++) {
      char current = text.charAt(i);
      for (char symbol : WRONG_AREA_SYMBOLS) {
        if (current == symbol) {
          count++;
          break;
        }
      }
    }
    return count;
  }
}