// blob: 18fc5f74fb15cd17fb0a72e9b84bfcace78f57a6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps.utils;
import java.awt.Graphics2D;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.commons.lang.StringUtils;
public class Utils {
/** Shared java.util.logging logger, registered under this class's fully qualified name. */
private static final Logger LOG = Logger
.getLogger("opennlp.tools.similarity.apps.utils.Utils");
/**
 * Ordered list of {@code {pattern, replacement}} pairs used by
 * {@link #convertToASCII(String)}. Each pair is handed to
 * {@code String.replaceAll} in insertion order, so element 0 is interpreted as
 * a regular expression and element 1 as a regex replacement string.
 */
protected static final ArrayList<String[]> characterMappings = new ArrayList<String[]>();
// Fills characterMappings once, when the class is initialised.
// NOTE(review): many character classes below contain U+FFFD and '?' — this looks
// like damage from saving the file in the wrong encoding. Inside [...] a '?' is a
// literal, so the first class that contains one also rewrites every question mark
// in the input. Confirm the intended characters before relying on this table.
static {
// lower-case accented 'a' variants (currently replaced by a space, not by 'a')
characterMappings
.add(new String[] {
"[àáâãäå�?ăą�°]",
" " }); // was a
characterMappings
.add(new String[] {
"[À�?ÂÃÄÅĀĂĄ�?]",
"A" });
characterMappings
.add(new String[] {
"[çćĉċ�?]",
"c" });
characterMappings
.add(new String[] {
"[ÇĆĈĊČ]",
"C" });
characterMappings.add(new String[] {
"[�?đ]", "d" });
characterMappings
.add(new String[] {
"[�?�?]",
"D" });
// lower-case accented 'e' variants (currently replaced by a space, not by 'e')
characterMappings
.add(new String[] {
"[èéêëæęēĕ�ęě]",
" " }); // was e
// upper-case accented 'E' variants (currently replaced by an apostrophe, not by 'E')
characterMappings
.add(new String[] {
"[ÈÉÊËÆĒ�ĖĘĚ]",
"'" }); // was E
characterMappings
.add(new String[] {
"[�?ğġģ]",
"g" });
characterMappings
.add(new String[] {
"[ĜĞĠĢƓ]",
"G" });
characterMappings.add(new String[] {
"[ĥħ]", "h" });
characterMappings.add(new String[] {
"[ĤĦ]", "H" });
characterMappings
.add(new String[] {
"[ìíîïĩīĭĮįıijĵ]",
"i" });
characterMappings
.add(new String[] {
"[Ì�?Î�?ĨĪĬİIJĴĵ]",
"I" });
characterMappings.add(new String[] {
"[ķĸ]", "k" });
characterMappings.add(new String[] { "[Ķ]", "K" });
characterMappings
.add(new String[] {
"[øőðòóôõö�?�?őœơ]",
"o" });
characterMappings
.add(new String[] {
"[ÒÓ�ÕÖØŌŎ�?ŒƠ]",
"O" });
characterMappings
.add(new String[] {
"[ñńņňʼnŋ]",
"n" });
characterMappings
.add(new String[] {
"[ÑŃŅŇŊŋ]",
"N" });
characterMappings
.add(new String[] {
"[ĺļľŀł]",
"l" });
characterMappings
.add(new String[] {
"[ĹĻĽĿ�?]",
"L" });
characterMappings
.add(new String[] {
"[ùúûüũūŭůűųư]",
"u" });
characterMappings
.add(new String[] {
"[ÙÚÛÜŨŪŬŮŰŲƯ]",
"U" });
characterMappings.add(new String[] {
"[ýÿŷ]", "y" });
characterMappings
.add(new String[] {
"[�?ŶŸ]",
"Y" });
characterMappings
.add(new String[] {
"[ŕ�ř]",
"r" });
characterMappings
.add(new String[] {
"[�ŖŘ]",
"R" });
characterMappings
.add(new String[] {
"[šś�?şšſ]",
"s" });
characterMappings
.add(new String[] {
"[ŠŚŜŞŠſ]",
"S" });
characterMappings.add(new String[] { "ß", "ss" });
// NOTE(review): 'Þ' is the upper-case thorn and 'þ' the lower-case one, so the
// capitalisation of the next two replacements looks swapped — confirm.
characterMappings.add(new String[] { "Þ", "th" });
characterMappings.add(new String[] { "þ", "Th" });
characterMappings
.add(new String[] {
"[ţťŧ]",
"t" });
characterMappings
.add(new String[] {
"[ŢŤŦ]",
"T" });
characterMappings.add(new String[] { "[ŵ]", "w" });
// NOTE(review): as written this class matches 'Å' or '´' individually; it is
// presumably a mis-decoded upper-case 'Ŵ' — confirm.
characterMappings.add(new String[] { "[Å´]", "W" });
characterMappings
.add(new String[] {
"[žźżžƶ]",
"z" });
characterMappings
.add(new String[] {
"[ŽŽŹŻŽƵ]",
"Z" });
// typographic punctuation and HTML entities
characterMappings.add(new String[] {
"[’]", "'" });
characterMappings.add(new String[] {
"[–]", "'" });
characterMappings.add(new String[] { "&#39;", "'" });
characterMappings.add(new String[] { "Âe", "«" });
characterMappings.add(new String[] { "'AG", "“" });
characterMappings.add(new String[] { "A�", " " });
characterMappings.add(new String[] { "&quot;", "\"" });
characterMappings.add(new String[] { "&amp;", "&" });
characterMappings.add(new String[] { "&nbsp;", " " });
// NOTE(review): this pattern appears empty here; if it really is "", replaceAll
// inserts a space at every position of the input. It may instead hold an
// invisible character — verify the raw bytes of the file.
characterMappings.add(new String[] {
"", " " });
characterMappings.add(new String[] { "â„¢",
" " });
characterMappings.add(new String[] {
"�", "" });
characterMappings.add(new String[] { "’", "'" });
}
/**
 * Removes every character whose UTF-16 code unit is greater than 256, trims the
 * result and collapses each run of whitespace into a single space.
 * <p>
 * Despite the method name the cut-off is 256, not 127, so Latin-1 characters
 * (and U+0100) survive.
 *
 * @param s text to filter; {@code null} is treated like an empty string
 * @return the filtered and whitespace-normalised text, never {@code null}
 */
public static String stripNonAsciiChars(String s) {
  StringBuilder kept = new StringBuilder();
  if (s != null) {
    for (char c : s.toCharArray()) {
      if (c <= 256) {
        kept.append(c);
      }
    }
  }
  String trimmed = kept.toString().trim();
  // Any run of whitespace becomes exactly one space.
  return trimmed.replaceAll("\\s+", " ");
}
/**
 * Normalises text by applying every {@code characterMappings} entry in order
 * (via {@code String.replaceAll}) and then running the result through
 * {@link #stripNonAsciiChars(String)}.
 * <p>
 * The typographic apostrophe is swapped for a placeholder before the table runs
 * and restored as {@code '} afterwards, so the table cannot rewrite it.
 * <p>
 * NOTE(review): the literal {@code "&amp"} (without semicolon) is deleted up
 * front, which leaves the {@code ';'} of an {@code "&amp;"} entity behind and
 * means the table's {@code "&amp;"} entry never matches — confirm this is intended.
 *
 * @param s text to convert; {@code null} yields an empty string (previously it
 *          caused a {@code NullPointerException} on the first statement)
 * @return the converted text, never {@code null}
 */
public static String convertToASCII(String s) {
  if (s == null) {
    // Same result stripNonAsciiChars produces for null input.
    return "";
  }
  String tmp = s.replace("&amp", "").replace("’", "__apostrophe__");
  for (String[] mapping : characterMappings) {
    tmp = tmp.replaceAll(mapping[0], mapping[1]);
  }
  return stripNonAsciiChars(tmp.replace("__apostrophe__", "'"));
}
/**
 * Mutable pair of an arbitrary key and a float value, used to sort map entries
 * by value.
 */
public static class KeyValue {
  /** The key half of the pair. */
  public Object key = null;
  /** The value half of the pair. */
  public float value = 0;

  /**
   * @param o key to store
   * @param i value to store; unboxed, so {@code null} throws
   *          {@code NullPointerException}
   */
  public KeyValue(Object o, Float i) {
    this.key = o;
    this.value = i;
  }

  /**
   * Orders {@link KeyValue} instances by descending {@code value}.
   * <p>
   * Equal values compare as 0, as the {@code Comparator} contract requires; the
   * previous implementation returned -1 for ties in both directions, which can
   * make {@code Collections.sort} fail with "Comparison method violates its
   * general contract".
   */
  public static class SortByValue implements Comparator<Object> {
    /**
     * @param obj1 first element, must be a {@link KeyValue}
     * @param obj2 second element, must be a {@link KeyValue}
     * @return negative when {@code obj1} has the larger value, positive when it
     *         has the smaller one, 0 when the values are equal
     * @throws ClassCastException if either argument is not a {@link KeyValue}
     */
    @Override
    public int compare(Object obj1, Object obj2) {
      float first = ((KeyValue) obj1).value;
      float second = ((KeyValue) obj2).value;
      // Arguments are swapped on purpose: larger values sort first.
      return Float.compare(second, first);
    }
  }
}
/**
 * Reads an image file, scales it to exactly {@code scaledWidth} x
 * {@code scaledHeight} (the aspect ratio is not preserved) and writes the result
 * as a JPEG.
 *
 * @param originalImage path of the image to read
 * @param newImage path of the JPEG to write
 * @param scaledWidth width of the output image in pixels
 * @param scaledHeight height of the output image in pixels
 * @return {@code true} if the scaled image was written; {@code false} if the
 *         source could not be decoded, no JPEG writer accepted the image, or
 *         any exception occurred (the failure is logged, not rethrown)
 */
public static boolean createResizedCopy(String originalImage,
    String newImage, int scaledWidth, int scaledHeight) {
  try {
    BufferedImage source = ImageIO.read(new File(originalImage));
    if (source == null) {
      // ImageIO.read returns null when no registered reader can decode the file.
      LOG.severe("Failed creating thumbnail for image: " + originalImage
          + " (no image reader could decode it)");
      return false;
    }
    BufferedImage scaled = new BufferedImage(scaledWidth, scaledHeight,
        BufferedImage.TYPE_INT_RGB);
    Graphics2D g = scaled.createGraphics();
    try {
      AffineTransform at = AffineTransform.getScaleInstance(
          (double) scaledWidth / source.getWidth(),
          (double) scaledHeight / source.getHeight());
      g.drawRenderedImage(source, at);
    } finally {
      // Release the native resources held by the graphics context.
      g.dispose();
    }
    // ImageIO.write reports "no suitable writer" through its return value.
    boolean written = ImageIO.write(scaled, "jpeg", new File(newImage));
    if (!written) {
      LOG.severe("Failed creating thumbnail for image: " + originalImage
          + " (no jpeg writer available)");
    }
    return written;
  } catch (Exception e) {
    // Best-effort API: report the failure through the return value.
    LOG.log(java.util.logging.Level.SEVERE,
        "Failed creating thumbnail for image: " + originalImage, e);
    return false;
  }
}
/** Returns the smallest of the three arguments. */
private static int minimum(int a, int b, int c) {
  return Math.min(a, Math.min(b, c));
}

/**
 * Computes the Levenshtein edit distance between two strings: the minimum number
 * of single-character insertions, deletions and substitutions that turn
 * {@code s} into {@code t}. Uses a full {@code (|s|+1) x (|t|+1)} table.
 *
 * @param s first string, must not be {@code null}
 * @param t second string, must not be {@code null}
 * @return the edit distance; the other string's length if one string is empty
 */
public static int computeEditDistance(String s, String t) {
  final int rows = s.length();
  final int cols = t.length();
  if (rows == 0) {
    return cols;
  }
  if (cols == 0) {
    return rows;
  }

  final int[][] dist = new int[rows + 1][cols + 1];
  // Distance from/to the empty prefix is just the prefix length.
  for (int r = 0; r <= rows; r++) {
    dist[r][0] = r;
  }
  for (int c = 0; c <= cols; c++) {
    dist[0][c] = c;
  }

  for (int r = 1; r <= rows; r++) {
    final char fromS = s.charAt(r - 1);
    for (int c = 1; c <= cols; c++) {
      final int substitution =
          dist[r - 1][c - 1] + (fromS == t.charAt(c - 1) ? 0 : 1);
      dist[r][c] = minimum(dist[r - 1][c] + 1, dist[r][c - 1] + 1, substitution);
    }
  }
  return dist[rows][cols];
}
/**
 * Converts a map into a list of {@link KeyValue} pairs and sorts that list with
 * {@link KeyValue.SortByValue}.
 *
 * @param h map to convert; its values are unboxed, so a {@code null} value
 *          throws {@code NullPointerException}
 * @return a new list holding one pair per map entry, in sorted order
 */
public static ArrayList<KeyValue> sortByValue(HashMap<Object, Float> h) {
  ArrayList<KeyValue> pairs = new ArrayList<KeyValue>();
  for (java.util.Map.Entry<Object, Float> entry : h.entrySet()) {
    pairs.add(new KeyValue(entry.getKey(), entry.getValue()));
  }
  Collections.sort(pairs, new KeyValue.SortByValue());
  return pairs;
}
/**
 * Renders a list of pairs as {@code key-value,key-value,...}; every pair,
 * including the last, is followed by a comma.
 *
 * @param l pairs to render
 * @return the concatenated text, empty for an empty list
 */
public static String convertKeyValueToString(ArrayList<KeyValue> l) {
  StringBuilder out = new StringBuilder();
  for (KeyValue pair : l) {
    out.append(pair.key).append("-").append(pair.value).append(",");
  }
  return out.toString();
}
/**
 * Concatenates the elements of a list, appending {@code ", "} after every
 * element (so a non-empty result always ends with {@code ", "}).
 *
 * @param l strings to concatenate
 * @return the concatenated text, empty for an empty list
 */
public static String convertStringArrayToString(ArrayList<String> l) {
  StringBuilder joined = new StringBuilder();
  for (String element : l) {
    joined.append(element).append(", ");
  }
  return joined.toString();
}
/**
 * Concatenates the elements of a list, appending a single space after every
 * element (so a non-empty result always ends with a space).
 *
 * @param l strings to concatenate
 * @return the concatenated text, empty for an empty list
 */
public static String convertStringArrayToPlainString(ArrayList<String> l) {
  StringBuilder joined = new StringBuilder();
  for (String element : l) {
    joined.append(element).append(" ");
  }
  return joined.toString();
}
/**
 * Tells whether {@code url} lacks a scheme-and-host prefix, i.e. whether it is
 * empty or does not start with {@code http://} or {@code https://}.
 * <p>
 * {@code https://} URLs used to be reported as having no domain.
 *
 * @param siteUrl unused; kept for signature compatibility
 * @param url URL to inspect, may be {@code null}
 * @return {@code true} if {@code url} is {@code null}, empty, or not an
 *         absolute http/https URL
 */
public static boolean noDomainInUrl(String siteUrl, String url) {
  if (url == null || url.length() == 0) {
    return true;
  }
  return !(url.startsWith("http://") || url.startsWith("https://"));
}
/**
 * Resolves a possibly relative {@code url} against {@code siteUrl}.
 * <ul>
 * <li>An empty or {@code null} {@code url} yields {@code null}.</li>
 * <li>A {@code url} that already starts with {@code http://} or
 * {@code https://} is returned unchanged.</li>
 * <li>If no {@code scheme://host/} prefix can be found in {@code siteUrl}, the
 * two are concatenated with exactly one {@code '/'} inserted when {@code url}
 * does not already start with one.</li>
 * <li>A {@code url} starting with {@code '/'} is appended to the scheme and
 * host of {@code siteUrl}.</li>
 * <li>Any other {@code url} replaces everything after the last {@code '/'} of
 * {@code siteUrl}.</li>
 * </ul>
 * {@code https} is now handled like {@code http}: previously an absolute
 * {@code https://} URL had the site prefix prepended, and an {@code https}
 * {@code siteUrl} never produced a host.
 *
 * @param siteUrl URL of the page the link was found on
 * @param url link to resolve, may be {@code null}
 * @return the resolved URL, or {@code null} if {@code url} is empty
 */
public static String addDomainToUrl(String siteUrl, String url) {
  if (url == null || url.length() == 0) {
    return null; // should we return siteUrl here ??
  }
  if (url.startsWith("http://") || url.startsWith("https://")) {
    return url;
  }
  String scheme = (siteUrl != null && siteUrl.startsWith("https://"))
      ? "https://" : "http://";
  String domain = domainAfterScheme(siteUrl, scheme);
  if (domain == null) {
    return siteUrl + (url.startsWith("/") ? "" : "/") + url;
  }
  if (url.startsWith("/")) {
    return scheme + domain + url;
  }
  // Relative link: swap the last path segment of siteUrl for url.
  return siteUrl.substring(0, siteUrl.lastIndexOf('/')) + "/" + url;
}

/**
 * Returns the text between the first occurrence of {@code scheme} in
 * {@code siteUrl} and the next {@code '/'}, or {@code null} if either is missing.
 */
private static String domainAfterScheme(String siteUrl, String scheme) {
  if (siteUrl == null) {
    return null;
  }
  int schemeAt = siteUrl.indexOf(scheme);
  if (schemeAt < 0) {
    return null;
  }
  int domainStart = schemeAt + scheme.length();
  int domainEnd = siteUrl.indexOf('/', domainStart);
  return domainEnd < 0 ? null : siteUrl.substring(domainStart, domainEnd);
}
/**
 * Sums all values of the table and returns the total truncated to an
 * {@code int}.
 * <p>
 * The sum is accumulated in floating point and truncated once at the end.
 * Previously each addition was truncated to {@code int} immediately
 * ({@code int += Float}), so fractional parts were lost at every step.
 *
 * @param b1 table whose values are summed
 * @return the truncated total, 0 for an empty table
 */
public static int countValues(Hashtable<String, Float> b1) {
  double total = 0;
  for (Float value : b1.values()) {
    total += value;
  }
  return (int) total;
}
/**
 * Sums all values of the map.
 *
 * @param b1 map whose values are summed; a {@code null} value throws
 *           {@code NullPointerException} when unboxed
 * @return the total, 0 for an empty map
 */
public static int countValues(HashMap<String, Integer> b1) {
  int total = 0;
  for (Integer count : b1.values()) {
    total += count;
  }
  return total;
}
/**
 * Renders a map as {@code key-value,key-value,...} in the map's iteration
 * order; every entry, including the last, is followed by a comma.
 *
 * @param m map to render
 * @return the concatenated text, empty for an empty map
 */
public static String convertHashMapToString(HashMap<String, Integer> m) {
  StringBuilder out = new StringBuilder();
  for (java.util.Map.Entry<String, Integer> entry : m.entrySet()) {
    out.append(entry.getKey()).append("-").append(entry.getValue()).append(",");
  }
  return out.toString();
}
/**
 * Tells whether the token contains no letters at all (as defined by
 * {@code Character.isLetter}).
 *
 * @param token text to inspect, must not be {@code null}
 * @return {@code false} as soon as a letter is found; {@code true} otherwise,
 *         including for the empty string
 */
public static boolean isTokenAllDigitOrPunc(String token) {
  for (char c : token.toCharArray()) {
    if (Character.isLetter(c)) {
      return false;
    }
  }
  return true;
}
/**
 * Tells whether the token contains at least one digit (as defined by
 * {@code Character.isDigit}).
 *
 * @param token text to inspect, must not be {@code null}
 * @return {@code true} as soon as a digit is found, {@code false} otherwise
 */
public static boolean containsDigit(String token) {
  for (char c : token.toCharArray()) {
    if (Character.isDigit(c)) {
      return true;
    }
  }
  return false;
}
/**
 * Replaces every occurrence of one character with a space.
 *
 * @param txt text to process, must not be {@code null}
 * @param uValue code of the character to blank out; it is compared with each
 *               UTF-16 code unit as an {@code int}, so values outside the
 *               {@code char} range never match
 * @return a copy of {@code txt} with each matching character replaced by a space
 */
public static String CleanCharacter(String txt, int uValue) {
  StringBuilder cleaned = new StringBuilder(txt.length());
  for (char c : txt.toCharArray()) {
    cleaned.append(c == uValue ? ' ' : c);
  }
  return cleaned.toString();
}
/**
 * Removes {@code <...>} tags from a string: every substring found between a
 * {@code '<'} and the following {@code '>'} is collected with
 * {@code StringUtils.substringsBetween}, then each {@code "<" + tag + ">"} is
 * deleted with {@code StringUtils.remove}.
 *
 * @param inputStr text to clean
 * @return the text without the collected tags; the input itself when no tag
 *         was found
 */
public static String removeHTMLTagsFromStr(String inputStr) {
  String[] tags = StringUtils.substringsBetween(inputStr, "<", ">");
  if (tags == null) {
    return inputStr;
  }
  String stripped = inputStr;
  for (int i = 0; i < tags.length; i++) {
    stripped = StringUtils.remove(stripped, "<" + tags[i] + ">");
  }
  return stripped;
}
/**
 * Delegates to commons-lang {@code StringEscapeUtils.unescapeHtml} and returns
 * its result unchanged.
 *
 * @param text text that may contain HTML entities
 * @return whatever {@code StringEscapeUtils.unescapeHtml} returns for {@code text}
 */
public static String unescapeHTML(String text) {
return org.apache.commons.lang.StringEscapeUtils.unescapeHtml(text);
}
/**
 * Deletes every {@code <...>} tag that opens and closes on the same line. The
 * match is non-greedy and {@code '.'} does not cross line terminators, so tags
 * spanning several lines are left in place.
 *
 * @param text text to clean, must not be {@code null}
 * @return the text without single-line tags
 */
public static String stripHTML(String text) {
  Matcher tags = Pattern.compile("\\<.*?>").matcher(text);
  return tags.replaceAll("");
}
/**
 * Deletes every {@code <script ...>...</script>} element together with its
 * content. Matching is case-insensitive, non-greedy and may span lines.
 *
 * @param text text to clean, must not be {@code null}
 * @return the text without script elements
 */
public static String stripScriptTags(String text) {
  final int flags = Pattern.DOTALL | Pattern.CASE_INSENSITIVE;
  return Pattern.compile("\\<SCRIPT.*?</SCRIPT>", flags).matcher(text).replaceAll("");
}
/**
 * Deletes every {@code <noscript ...>...</noscript>} element together with its
 * content. Matching is case-insensitive, non-greedy and may span lines.
 *
 * @param text text to clean, must not be {@code null}
 * @return the text without noscript elements
 */
public static String stripNoScriptTags(String text) {
  final int flags = Pattern.DOTALL | Pattern.CASE_INSENSITIVE;
  return Pattern.compile("\\<NOSCRIPT.*?</NOSCRIPT>", flags).matcher(text).replaceAll("");
}
/**
 * Removes all {@code <...>} tags from {@code text} except the plain opening and
 * closing forms ({@code <tag>} and {@code </tag>}) of the allowed tags.
 * <p>
 * Allowed tags are temporarily rewritten with the two escape tokens, the
 * remaining markup is stripped with {@link #stripHTMLMultiLine(String)}, and
 * the tokens are then turned back into {@code '<'} and {@code '>'}. Because the
 * final step replaces every occurrence of the tokens, they should not otherwise
 * occur in {@code text}.
 * <p>
 * Tag names and escape tokens are treated as literal text. They used to be
 * passed to {@code String.replaceAll} as regular expressions and replacement
 * strings, so tokens such as {@code "[["} or {@code "$"} threw or were misread.
 *
 * @param text text to clean; {@code null} or empty is returned unchanged
 * @param allowedHtmlTags names of the tags to keep, may be {@code null}
 * @param escGtCh temporary stand-in for {@code '>'}
 * @param escLtCh temporary stand-in for {@code '<'}
 * @return the cleaned text
 */
public static String stripHTMLMultiLine(String text,
    HashSet<String> allowedHtmlTags, String escGtCh, String escLtCh) {
  if (text == null || text.length() == 0) {
    return text;
  }
  boolean hadAllowedHtmlTags = false;
  if (allowedHtmlTags != null) {
    for (String htmlTag : allowedHtmlTags) {
      String escaped = text
          .replace("<" + htmlTag + ">", escLtCh + htmlTag + escGtCh)
          .replace("</" + htmlTag + ">", escLtCh + "/" + htmlTag + escGtCh);
      if (!escaped.equals(text)) {
        text = escaped;
        hadAllowedHtmlTags = true;
      }
    }
  }
  text = stripHTMLMultiLine(text);
  if (hadAllowedHtmlTags) {
    // Only undo the escaping when something was actually escaped.
    text = text.replace(escLtCh, "<").replace(escGtCh, ">");
  }
  return text;
}

/**
 * Deletes every {@code <...>} tag, including tags that span several lines
 * (non-greedy match with {@code DOTALL}).
 *
 * @param text text to clean, must not be {@code null}
 * @return the text without tags
 */
public static String stripHTMLMultiLine(String text) {
  Pattern p = java.util.regex.Pattern.compile("\\<.*?>", Pattern.DOTALL);
  Matcher matcher = p.matcher(text);
  return matcher.replaceAll("");
}
/**
 * Deletes every HTML comment ({@code <!-- ... -->}), including comments that
 * span several lines (non-greedy match with {@code DOTALL}).
 *
 * @param text text to clean, must not be {@code null}
 * @return the text without HTML comments
 */
public static String stripHTMLCommentsMultiLine(String text) {
  return Pattern.compile("\\<!--.*?-->", Pattern.DOTALL).matcher(text).replaceAll("");
}
/**
 * Tells whether every bit of {@code flagToCheck} is set in {@code flags}.
 *
 * @param flags bit field to inspect, may be {@code null}
 * @param flagToCheck bit mask to look for, may be {@code null}
 * @return {@code true} if both arguments are non-null and
 *         {@code (flags & flagToCheck) == flagToCheck}; {@code false} otherwise
 */
public static boolean isFlagSet(Integer flags, Integer flagToCheck) {
  if (flags == null || flagToCheck == null) {
    return false;
  }
  final int mask = flagToCheck;
  return (flags & mask) == mask;
}
/**
 * Sets or clears a flag depending on {@code shouldSet}, by delegating to
 * {@link #setFlag(Integer, Integer)} or {@link #resetFlag(Integer, Integer)}.
 *
 * @param flags current bit field
 * @param flagToCheck flag to set or clear
 * @param shouldSet {@code true} to set the flag, {@code false} to clear it
 * @return the result of the delegated call
 */
public static Integer updateFlag(Integer flags, Integer flagToCheck,
    boolean shouldSet) {
  return shouldSet ? setFlag(flags, flagToCheck) : resetFlag(flags, flagToCheck);
}
/**
 * Returns {@code flags} with every bit of {@code flagToCheck} switched on.
 * <p>
 * Uses a bitwise OR. The previous implementation added the mask when it was
 * not fully set, which corrupted the bit field whenever a multi-bit mask was
 * already partly set (e.g. {@code 1 + 3 == 4}).
 *
 * @param flags current bit field; {@code null} is treated as 0
 * @param flagToCheck bit mask to set; {@code null} leaves the field unchanged
 *                    (previously a {@code NullPointerException})
 * @return the updated bit field, never {@code null}
 */
public static Integer setFlag(Integer flags, Integer flagToCheck) {
  int current = (flags == null) ? 0 : flags;
  if (flagToCheck == null) {
    return Integer.valueOf(current);
  }
  return Integer.valueOf(current | flagToCheck);
}
/**
 * Returns {@code flags} with every bit of {@code flagToCheck} switched off.
 * <p>
 * Uses {@code flags & ~flagToCheck}. The previous implementation subtracted the
 * mask only when all of its bits were set, so a partly set multi-bit mask was
 * silently left untouched.
 *
 * @param flags current bit field; {@code null} is treated as 0
 * @param flagToCheck bit mask to clear; {@code null} leaves the field unchanged
 * @return the updated bit field, never {@code null}
 */
public static Integer resetFlag(Integer flags, Integer flagToCheck) {
  if (flags == null) {
    // nothing to reset
    return Integer.valueOf(0);
  }
  if (flagToCheck == null) {
    return flags;
  }
  return Integer.valueOf(flags & ~flagToCheck);
}
/**
 * Shortens text to roughly {@code length} characters without cutting a word.
 * Text no longer than {@code length} is returned as is; otherwise the text is
 * cut at the first whitespace character at index {@code length} or later and
 * {@code "..."} is appended. If no such whitespace exists, the whole text is
 * kept. The result is trimmed in every case.
 *
 * @param text text to shorten, must not be {@code null}
 * @param length minimum number of characters to keep, must not be {@code null}
 * @return the trimmed, possibly shortened text
 */
public static String truncateOnSpace(String text, Integer length) {
  if (text.length() <= length) {
    return text.trim();
  }
  final int limit = length;
  // A negative limit means the search starts at the first character.
  for (int i = Math.max(limit, 0); i < text.length(); i++) {
    if (Character.isWhitespace(text.charAt(i))) {
      return (text.substring(0, i) + "...").trim();
    }
  }
  return text.trim();
}
/**
 * Cleans a piece of HTML text in five steps: strips HTML comments, strips all
 * tags, unescapes HTML entities, trims (turning {@code null} into an empty
 * string) and collapses each run of whitespace into a single space.
 *
 * @param text text to clean
 * @return the cleaned text
 */
public static String sanitizeString(String text) {
  String withoutComments = Utils.stripHTMLCommentsMultiLine(text);
  String withoutTags = Utils.stripHTMLMultiLine(withoutComments);
  String unescaped = Utils.unescapeHTML(withoutTags);
  return StringUtils.trimToEmpty(unescaped).replaceAll("\\s+", " ");
}
/**
 * Keeps only letters, digits and the space character, collapses each run of
 * spaces into one and passes the result through
 * {@link #convertToASCII(String)}.
 *
 * @param text text to convert, must not be {@code null}
 * @return the converted text
 */
public static String makeStringUrlSafe(String text) {
  StringBuilder kept = new StringBuilder();
  for (char c : text.toCharArray()) {
    if (Character.isLetterOrDigit(c) || c == ' ') {
      kept.append(c);
    }
  }
  String singleSpaced = kept.toString().replaceAll("\\s+", " ");
  return Utils.convertToASCII(singleSpaced);
}
/**
 * Extracts the digits that follow {@code "news/"} in a URL. When the pattern
 * occurs more than once, the digits of the last occurrence win.
 *
 * @param url URL to scan, must not be {@code null}
 * @return the matched digits, or {@code null} if the URL has no
 *         {@code news/<digits>} part
 */
public static String getEventIdFromNewsUrl(String url) {
  Matcher ids = Pattern.compile("news/([0-9]+)").matcher(url);
  String lastId = null;
  while (ids.find()) {
    lastId = ids.group(1);
  }
  return lastId;
}
/**
 * Joins the string forms of the list elements with commas (no spaces, no
 * trailing comma).
 *
 * @param ids elements to join, may be {@code null}
 * @return the joined text, or {@code null} if the list is {@code null} or empty
 */
public static String buildCommaSeparatedIds(List ids) {
  if (ids == null || ids.isEmpty()) {
    return null;
  }
  StringBuilder csv = new StringBuilder();
  String separator = "";
  for (Object id : ids) {
    csv.append(separator).append(id);
    separator = ",";
  }
  return csv.toString();
}
/**
 * Derives a score from an existing score list and a 1-based target position:
 * <ul>
 * <li>position 1: the first score plus 50000;</li>
 * <li>position equal to the list size: the last score minus 1;</li>
 * <li>any other position: the mean of the scores at positions
 * {@code desiredRanking - 1} and {@code desiredRanking} (1-based).</li>
 * </ul>
 * NOTE(review): presumably {@code scores} is sorted in descending order so the
 * result lands at the requested rank — confirm with callers.
 *
 * @param scores existing scores, must contain the positions accessed above
 * @param desiredRanking 1-based target position
 * @return the derived score
 */
public static float computeScoreForRanking(List<Float> scores,
    int desiredRanking) {
  if (desiredRanking == 1) {
    return scores.get(0) + 50000;
  }
  if (desiredRanking == scores.size()) {
    return scores.get(scores.size() - 1) - 1;
  }
  float above = scores.get(desiredRanking - 2);
  float below = scores.get(desiredRanking - 1);
  return (above + below) / 2;
}
/**
 * Removes script, noscript and style elements (with their content) and then
 * deletes every remaining {@code <...>} tag that opens and closes on one line.
 *
 * @param text text to clean, must not be {@code null}
 * @return the cleaned text
 */
public static String fullStripHTML(String text) {
  String withoutBlocks =
      Utils.stripStyleTags(Utils.stripNoScriptTags(Utils.stripScriptTags(text)));
  return withoutBlocks.replaceAll("\\<.*?>", "");
}
/**
 * Deletes every {@code <style ...>...</style>} element together with its
 * content. Matching is case-insensitive, non-greedy and may span lines.
 *
 * @param text text to clean, must not be {@code null}
 * @return the text without style elements
 */
public static String stripStyleTags(String text) {
  final int flags = Pattern.DOTALL | Pattern.CASE_INSENSITIVE;
  return Pattern.compile("\\<STYLE.*?</STYLE>", flags).matcher(text).replaceAll("");
}
/**
 * Tells whether the word consists of ASCII characters only (code units 0 to
 * 127).
 * <p>
 * The previous bound of 128 was off by one and accepted U+0080, which is not
 * an ASCII character.
 *
 * @param word text to inspect, must not be {@code null}
 * @return {@code true} if every character is ASCII, including for the empty
 *         string; {@code false} otherwise
 */
public static boolean isLatinWord(String word) {
  for (int i = 0; i < word.length(); i++) {
    if (word.charAt(i) > 127) {
      return false;
    }
  }
  return true;
}
/**
 * Manual smoke test: prints the results of {@code isLatinWord} and
 * {@code convertToASCII} for a handful of sample strings.
 * NOTE(review): several sample literals look mis-encoded (they contain U+FFFD
 * and mojibake sequences) — confirm against the intended text.
 *
 * @param args ignored
 */
static public void main(String[] args) {
// isLatinWord samples
System.out.println(isLatinWord("Performing Arts Center (SPAC)"));
System.out.println(isLatinWord("“Jazz Age�"));
System.out
.println(isLatinWord("デービッド・ã"));
System.out
.println(isLatinWord("é ñçøåó"));
System.out
.println(isLatinWord("ùìîä à øöé"));
System.out
.println(isLatinWord("陳港�, 陈港�"));
// convertToASCII samples
System.out
.println(convertToASCII("Irvine Bay Hotel & Golf Club on Sunday, May 01 during Jazz on the Beach, Tobago Jazz Experience alongside The Jazz Singer"));
System.out
.println(convertToASCII("This year’s event, held again at the wonderful Saratoga Performing Arts Center (SPAC)"));
System.out
.println(convertToASCII("and the great saxophone playing of Sam Rogers Rush Hour Blues 2010   . "));
System.out
.println(convertToASCII("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Ron Carter is among the most original, prolific "));
System.out
.println(convertToASCII(" .             Ron Carter is among the most original, prolific. "));
// TODO deal with
// www.wmot.org/program-guide/program-listings/28th_annual_playboy_jazz_festiva_2006.htm
System.out
.println(convertToASCII("By the mid 1920’s, during the period referred to as the “Jazz Age�, jazz music was heard in most major cities from the East Coast"));
}
}