blob: eb9be1d576267f0593063566f80e3a27ee004f7f [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
import java.awt.Graphics2D;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.commons.lang.StringUtils;
public class Utils {
// Logger shared by the static helpers in this class; obtained under the empty name "".
private static final Logger LOG = Logger.getLogger("");
// Ordered list of two-element arrays. convertToASCII walks the list in order and
// calls String.replaceAll(entry[0], entry[1]), so entry[0] is interpreted as a
// regular expression and entry[1] as a regex replacement string.
protected static final ArrayList<String[]> characterMappings = new ArrayList<String[]>();
static {
.add(new String[] {
" " }); // was a
.add(new String[] {
"A" });
characterMappings.add(new String[] {
"[çćĉċ�?]", "c" });
characterMappings.add(new String[] {
"[ÇĆĈĊČ]", "C" });
characterMappings.add(new String[] { "[�?đ]", "d" });
characterMappings.add(new String[] {
"[�?�?]", "D" });
.add(new String[] {
" " }); // was e
.add(new String[] {
"'" }); // was E
characterMappings.add(new String[] {
"[�?ğġģ]", "g" });
characterMappings.add(new String[] {
"[ĜĞĠĢƓ]", "G" });
characterMappings.add(new String[] { "[ĥħ]", "h" });
characterMappings.add(new String[] { "[ĤĦ]", "H" });
.add(new String[] {
"i" });
.add(new String[] {
"I" });
characterMappings.add(new String[] { "[ķĸ]", "k" });
characterMappings.add(new String[] { "[Ķ]", "K" });
.add(new String[] {
"o" });
.add(new String[] {
"O" });
characterMappings.add(new String[] {
"n" });
characterMappings.add(new String[] {
"N" });
characterMappings.add(new String[] {
"[ĺļľŀł]", "l" });
characterMappings.add(new String[] {
"[ĹĻĽĿ�?]", "L" });
.add(new String[] {
"u" });
.add(new String[] {
"U" });
characterMappings.add(new String[] { "[ýÿŷ]", "y" });
characterMappings.add(new String[] { "[�?ŶŸ]",
"Y" });
characterMappings.add(new String[] {
"[ŕ�ř]", "r" });
characterMappings.add(new String[] {
"[�ŖŘ]", "R" });
.add(new String[] {
"s" });
characterMappings.add(new String[] {
"[ŠŚŜŞŠſ]", "S" });
characterMappings.add(new String[] { "ß", "ss" });
characterMappings.add(new String[] { "Þ", "th" });
characterMappings.add(new String[] { "þ", "Th" });
.add(new String[] { "[ţťŧ]", "t" });
.add(new String[] { "[ŢŤŦ]", "T" });
characterMappings.add(new String[] { "[ŵ]", "w" });
characterMappings.add(new String[] { "[Å´]", "W" });
characterMappings.add(new String[] {
"[žźżžƶ]", "z" });
characterMappings.add(new String[] {
"[ŽŽŹŻŽƵ]", "Z" });
characterMappings.add(new String[] { "[’]", "'" });
characterMappings.add(new String[] { "[–]", "'" });
characterMappings.add(new String[] { "&#39;", "'" });
characterMappings.add(new String[] { "Âe", "«" });
characterMappings.add(new String[] { "'AG", "“" });
characterMappings.add(new String[] { "A�", " " });
characterMappings.add(new String[] { "&quot;", "\"" });
characterMappings.add(new String[] { "&amp;", "&" });
characterMappings.add(new String[] { "&nbsp;", " " });
characterMappings.add(new String[] { "", " " });
characterMappings.add(new String[] { "â„¢", " " });
characterMappings.add(new String[] { "�", "" });
characterMappings.add(new String[] { "’", "'" });
/**
 * Builds a filtered copy of {@code s} in a buffer, then trims it and collapses
 * every run of whitespace into a single space.
 *
 * A {@code null} input skips the loop entirely, so the result is the empty
 * string. The per-character test compares the char value against 256, which is
 * wider than the 7-bit ASCII range the method name suggests.
 *
 * NOTE(review): the statement guarded by the {@code <= 256} test is not present
 * in this copy of the file; presumably it appends the character to {@code b}.
 * Confirm against the upstream source.
 *
 * @param s text to filter; may be {@code null}
 * @return the trimmed, whitespace-normalised contents of the buffer
 */
public static String stripNonAsciiChars(String s) {
StringBuffer b = new StringBuffer();
if (s != null) {
for (int i = 0; i < s.length(); i++) {
if (((int) s.charAt(i)) <= 256) {
return b.toString().trim().replaceAll("\\s+", " "); // replace any multiple
// spaces with a single
// space
public static String convertToASCII(String s) {
s = s.replace("&amp", "");
s = s.replaceAll("’", "__apostrophe__");
String tmp = s;
if (tmp != null) {
for (String[] mapping : characterMappings) {
tmp = tmp.replaceAll(mapping[0], mapping[1]);
return stripNonAsciiChars(tmp.replaceAll("__apostrophe__", "'"));
public static class KeyValue {
public Object key = null;
public float value = 0;
public KeyValue(Object o, Float i) {
this.key = o;
this.value = i;
public static class SortByValue implements Comparator {
public int compare(Object obj1, Object obj2) {
float i1 = ((KeyValue) obj1).value;
float i2 = ((KeyValue) obj2).value;
if (i1 < i2)
return 1;
return -1;
/**
 * Draws the source image, scaled by the ratios scaledWidth / source width and
 * scaledHeight / source height, into a new image of scaledWidth x scaledHeight
 * and writes that image to {@code newImage} using the "jpeg" format name.
 *
 * Any exception raised along the way is caught: a SEVERE message is logged and
 * {@code false} is returned instead of propagating the failure.
 *
 * NOTE(review): in this copy the initializer of {@code bsrc} and the last
 * argument of the {@code BufferedImage} constructor are missing; presumably the
 * source image is read from {@code o}. Confirm against the upstream source.
 *
 * @param originalImage path of the image to read
 * @param newImage path the scaled copy is written to
 * @param scaledWidth width of the copy
 * @param scaledHeight height of the copy
 * @return {@code true} unless an exception was caught
 */
public static boolean createResizedCopy(String originalImage,
String newImage, int scaledWidth, int scaledHeight) {
boolean retVal = true;
try {
File o = new File(originalImage);
BufferedImage bsrc =;
BufferedImage bdest = new BufferedImage(scaledWidth, scaledHeight,
Graphics2D g = bdest.createGraphics();
// Scale each axis independently, so the aspect ratio is not preserved
// unless the caller passes proportional dimensions.
AffineTransform at = AffineTransform.getScaleInstance(
(double) scaledWidth / bsrc.getWidth(),
(double) scaledHeight / bsrc.getHeight());
g.drawRenderedImage(bsrc, at);
ImageIO.write(bdest, "jpeg", new File(newImage));
} catch (Exception e) {
retVal = false;
LOG.severe("Failed creating thumbnail for image: " + originalImage + e);
return retVal;
private static int minimum(int a, int b, int c) {
int mi;
mi = a;
if (b < mi) {
mi = b;
if (c < mi) {
mi = c;
return mi;
public static int computeEditDistance(String s, String t) {
int d[][]; // matrix
int n; // length of s
int m; // length of t
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
char t_j; // jth character of t
int cost; // cost
// Step 1
n = s.length();
m = t.length();
if (n == 0) {
return m;
if (m == 0) {
return n;
d = new int[n + 1][m + 1];
// Step 2
for (i = 0; i <= n; i++) {
d[i][0] = i;
for (j = 0; j <= m; j++) {
d[0][j] = j;
// Step 3
for (i = 1; i <= n; i++) {
s_i = s.charAt(i - 1);
// Step 4
for (j = 1; j <= m; j++) {
t_j = t.charAt(j - 1);
// Step 5
if (s_i == t_j) {
cost = 0;
} else {
cost = 1;
// Step 6
d[i][j] = minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]
+ cost);
// Step 7
return d[n][m];
public static ArrayList<KeyValue> sortByValue(HashMap<Object, Float> h) {
ArrayList<KeyValue> res = new ArrayList<KeyValue>();
for (Object o : h.keySet()) {
// form a pair
res.add(new KeyValue(o, h.get(o)));
Collections.sort(res, new KeyValue.SortByValue());
return res;
/**
 * Iterates over the pairs in {@code l} and returns the contents of a string
 * buffer built along the way.
 *
 * NOTE(review): the loop body is not present in this copy of the file, so the
 * exact text appended per pair cannot be determined here. Confirm against the
 * upstream source.
 *
 * @param l pairs to render; must not be {@code null}
 * @return the accumulated buffer contents
 */
public static String convertKeyValueToString(ArrayList<KeyValue> l) {
StringBuffer retVal = new StringBuffer();
for (KeyValue kv : l) {
return retVal.toString();
/**
 * Builds a single string from the list, appending ", " once per element.
 *
 * NOTE(review): only the separator append is visible in this copy; presumably
 * a statement appending {@code s} itself was lost. As written the separator
 * is appended after every element, including the last one.
 *
 * @param l strings to join; must not be {@code null}
 * @return the accumulated buffer contents
 */
public static String convertStringArrayToString(ArrayList<String> l) {
StringBuffer b = new StringBuffer();
for (String s : l) {
b.append(", ");
return b.toString();
/**
 * Builds a single string from the list, appending one space per element.
 *
 * NOTE(review): only the space append is visible in this copy; presumably a
 * statement appending {@code s} itself was lost. As written the space is
 * appended after every element, including the last one.
 *
 * @param l strings to join; must not be {@code null}
 * @return the accumulated buffer contents
 */
public static String convertStringArrayToPlainString(ArrayList<String> l) {
StringBuffer b = new StringBuffer();
for (String s : l) {
b.append(" ");
return b.toString();
public static boolean noDomainInUrl(String siteUrl, String url) {
if (StringUtils.isEmpty(url)) {
return true;
if (!url.startsWith("http://")) {
return true;
return false;
public static String addDomainToUrl(String siteUrl, String url) {
if (StringUtils.isEmpty(url)) {
return null; // should we return siteUrl here ??
if (!url.startsWith("http://")) {
String domain = StringUtils.substringBetween(siteUrl, "http://", "/");
if (domain == null) {
url = siteUrl + (url.startsWith("/") ? "" : "/") + url;
} else {
if (!url.startsWith("/")) {
int lastIndex = StringUtils.lastIndexOf(siteUrl, "/");
url = siteUrl.substring(0, lastIndex) + "/" + url;
} else {
url = "http://" + domain + url;
return url;
public static int countValues(Hashtable<String, Float> b1) {
int retVal = 0;
for (String s : b1.keySet()) {
retVal += b1.get(s);
return retVal;
public static int countValues(HashMap<String, Integer> b1) {
int retVal = 0;
for (String s : b1.keySet()) {
retVal += b1.get(s);
return retVal;
/**
 * Iterates over the keys of {@code m} and returns the contents of a string
 * buffer built along the way.
 *
 * NOTE(review): the loop body is not present in this copy of the file, so the
 * exact text appended per key cannot be determined here. Confirm against the
 * upstream source.
 *
 * @param m map to render; must not be {@code null}
 * @return the accumulated buffer contents
 */
public static String convertHashMapToString(HashMap<String, Integer> m) {
StringBuffer s = new StringBuffer();
for (String x : m.keySet()) {
return s.toString();
public static boolean isTokenAllDigitOrPunc(String token) {
for (int i = 0; i < token.length(); i++) {
if (java.lang.Character.isLetter(token.charAt(i))) {
return false;
return true;
public static boolean containsDigit(String token) {
for (int i = 0; i < token.length(); i++) {
if (java.lang.Character.isDigit(token.charAt(i))) {
return true;
return false;
/**
 * Walks {@code txt} character by character and appends a single space to the
 * result for every character whose numeric value equals {@code uValue}.
 *
 * NOTE(review): the branch for characters that differ from {@code uValue} is
 * empty in this copy; presumably it appended the character unchanged, so that
 * the method replaces occurrences of {@code uValue} with spaces. Confirm
 * against the upstream source.
 *
 * @param txt text to process; must not be {@code null}
 * @param uValue numeric value of the character to replace
 * @return the accumulated buffer contents
 */
public static String CleanCharacter(String txt, int uValue) {
StringBuffer retVal = new StringBuffer();
for (int i = 0; i < txt.length(); i++) {
int uChar = (txt.charAt(i));
if (uChar != uValue) {
} else {
retVal.append(" ");
return retVal.toString();
public static String removeHTMLTagsFromStr(String inputStr) {
String[] removeTags = StringUtils.substringsBetween(inputStr, "<", ">");
if (removeTags != null && removeTags.length > 0) {
for (String tag : removeTags) {
inputStr = StringUtils.remove(inputStr, "<" + tag + ">");
return inputStr;
public static String unescapeHTML(String text) {
return org.apache.commons.lang.StringEscapeUtils.unescapeHtml(text);
public static String stripHTML(String text) {
return text.replaceAll("\\<.*?>", "");
/**
 * Removes every span matching {@code <SCRIPT ... </SCRIPT>} (reluctant match)
 * from the text.
 *
 * NOTE(review): the second argument of {@code Pattern.compile} (the flags) is
 * missing in this copy, so case sensitivity and multi-line behaviour cannot be
 * determined here. Confirm against the upstream source.
 *
 * @param text text to clean; must not be {@code null}
 * @return the text with all matches removed
 */
public static String stripScriptTags(String text) {
Pattern p = java.util.regex.Pattern.compile("\\<SCRIPT.*?</SCRIPT>",
Matcher matcher = p.matcher(text);
String tmp = matcher.replaceAll("");
return tmp;
/**
 * Removes every span matching {@code <NOSCRIPT ... </NOSCRIPT>} (reluctant
 * match) from the text.
 *
 * NOTE(review): the second argument of {@code Pattern.compile} (the flags) is
 * missing in this copy, so case sensitivity and multi-line behaviour cannot be
 * determined here. Confirm against the upstream source.
 *
 * @param text text to clean; must not be {@code null}
 * @return the text with all matches removed
 */
public static String stripNoScriptTags(String text) {
Pattern p = java.util.regex.Pattern.compile("\\<NOSCRIPT.*?</NOSCRIPT>",
Matcher matcher = p.matcher(text);
String tmp = matcher.replaceAll("");
return tmp;
public static String stripHTMLMultiLine(String text,
HashSet<String> allowedHtmlTags, String escGtCh, String escLtCh) {
if (StringUtils.isNotEmpty(text)) {
boolean hadAllowedHtmlTags = false;
if (allowedHtmlTags != null) {
for (String htmlTag : allowedHtmlTags) {
String tmp = text.replaceAll("<" + htmlTag + ">", escLtCh + htmlTag
+ escGtCh);
tmp = tmp.replaceAll("</" + htmlTag + ">", escLtCh + "/" + htmlTag
+ escGtCh);
if (!tmp.equals(text)) {
text = tmp;
hadAllowedHtmlTags = true;
text = stripHTMLMultiLine(text);
if (hadAllowedHtmlTags) {
text = text.replaceAll(escLtCh, "<");
text = text.replaceAll(escGtCh, ">");
return text;
public static String stripHTMLMultiLine(String text) {
Pattern p = java.util.regex.Pattern.compile("\\<.*?>", Pattern.DOTALL);
Matcher matcher = p.matcher(text);
String tmp = matcher.replaceAll("");
return tmp;
public static String stripHTMLCommentsMultiLine(String text) {
Pattern p = java.util.regex.Pattern.compile("\\<!--.*?-->", Pattern.DOTALL);
Matcher matcher = p.matcher(text);
String tmp = matcher.replaceAll("");
return tmp;
public static boolean isFlagSet(Integer flags, Integer flagToCheck) {
if (flags != null && flagToCheck != null) {
return ((flags & flagToCheck) == flagToCheck);
return false;
public static Integer updateFlag(Integer flags, Integer flagToCheck,
boolean shouldSet) {
if (shouldSet) {
return setFlag(flags, flagToCheck);
} else {
return resetFlag(flags, flagToCheck);
public static Integer setFlag(Integer flags, Integer flagToCheck) {
if (flags == null) {
flags = new Integer(0);
if (!isFlagSet(flags, flagToCheck)) {
flags = flags + flagToCheck;
return flags;
public static Integer resetFlag(Integer flags, Integer flagToCheck) {
if (flags == null) {
// nothing to reset
flags = new Integer(0);
return flags;
if (isFlagSet(flags, flagToCheck)) {
flags = flags - flagToCheck;
return flags;
/**
 * Shortens {@code text} to roughly {@code length} characters without cutting
 * inside a word: text no longer than {@code length} is returned as is
 * (trimmed); otherwise characters are scanned until the buffer holds at least
 * {@code length} characters and the current character is whitespace.
 *
 * NOTE(review): the statements inside the loop (appending to {@code b} and
 * leaving the loop once the cut point is found) are not present in this copy;
 * the description above is inferred from the visible condition. Confirm
 * against the upstream source.
 *
 * @param text text to shorten; must not be {@code null}
 * @param length minimum number of characters to keep; must not be {@code null}
 * @return the trimmed result
 */
public static String truncateOnSpace(String text, Integer length) {
String retVal = "";
if (text.length() <= length) {
retVal = text;
} else {
StringBuffer b = new StringBuffer();
for (int i = 0; i < text.length(); i++) {
if (b.length() >= length && Character.isWhitespace(text.charAt(i))) { // iterate
// until
// we
// hit
// whitespace
retVal = b.toString();
return retVal.trim();
public static String sanitizeString(String text) {
text = Utils.stripHTMLCommentsMultiLine(text);
text = Utils.stripHTMLMultiLine(text);
text = Utils.unescapeHTML(text);
text = StringUtils.trimToEmpty(text);
text = text.replaceAll("\\s+", " ");
return text;
/**
 * Tests each character of {@code text} with
 * {@code StringUtils.isAlphanumericSpace}, collapses whitespace runs in the
 * collected buffer to a single space and passes the result through
 * {@code convertToASCII}.
 *
 * NOTE(review): the statement guarded by the alphanumeric test is not present
 * in this copy; presumably it appends the accepted character to {@code b}.
 * Confirm against the upstream source.
 *
 * @param text text to convert; must not be {@code null}
 * @return the value returned by {@code convertToASCII} for the collected text
 */
public static String makeStringUrlSafe(String text) {
StringBuffer b = new StringBuffer();
for (int i = 0; i < text.length(); i++) {
if (StringUtils.isAlphanumericSpace(String.valueOf(text.charAt(i)))) {
return Utils.convertToASCII(b.toString().replaceAll("\\s+", " "));
/**
 * Searches {@code url} for the pattern {@code news/([0-9]+)} and returns an
 * event id derived from the matches, or {@code null} when nothing matches.
 * Because the assignment sits inside a {@code while (matcher.find())} loop,
 * the last match wins.
 *
 * NOTE(review): the right-hand side of the {@code eventId} assignment is
 * missing in this copy; presumably it is the digits captured by group 1.
 * Confirm against the upstream source.
 *
 * @param url URL to inspect; must not be {@code null}
 * @return the extracted id, or {@code null} if the pattern does not occur
 */
public static String getEventIdFromNewsUrl(String url) {
String eventId = null;
String p = "news/([0-9]+)";
Pattern pattern = Pattern.compile(p);
Matcher matcher = pattern.matcher(url);
while (matcher.find()) {
// System.out.println("found: " +;
eventId =;
return eventId;
/**
 * Builds a string from the elements of {@code ids}; returns {@code null} when
 * the list is {@code null} or empty.
 *
 * NOTE(review): the statements that append to {@code sbuf} are not present in
 * this copy; judging by the method name and the {@code count > 0} test, a
 * separator is presumably written before every element except the first.
 * Confirm against the upstream source.
 *
 * @param ids elements to join; may be {@code null}
 * @return the joined string, or {@code null} for a null or empty list
 */
public static String buildCommaSeparatedIds(List ids) {
if (ids != null && ids.size() > 0) {
StringBuffer sbuf = new StringBuffer();
for (int count = 0; count < ids.size(); count++) {
if (count > 0) {
return sbuf.toString();
return null;
public static float computeScoreForRanking(List<Float> scores,
int desiredRanking) {
float newScore = 0f;
if (desiredRanking == 1) {
newScore = scores.get(0) + 50000;
} else if (desiredRanking == scores.size()) {
newScore = scores.get(scores.size() - 1) - 1;
} else {
newScore = (scores.get(desiredRanking - 2) + scores
.get(desiredRanking - 1)) / 2;
return newScore;
public static String fullStripHTML(String text) {
text = Utils.stripScriptTags(text);
text = Utils.stripNoScriptTags(text);
text = Utils.stripStyleTags(text);
return text.replaceAll("\\<.*?>", "");
/**
 * Removes every span matching {@code <STYLE ... </STYLE>} (reluctant match)
 * from the text.
 *
 * NOTE(review): the second argument of {@code Pattern.compile} (the flags) is
 * missing in this copy, so case sensitivity and multi-line behaviour cannot be
 * determined here. Confirm against the upstream source.
 *
 * @param text text to clean; must not be {@code null}
 * @return the text with all matches removed
 */
public static String stripStyleTags(String text) {
Pattern p = java.util.regex.Pattern.compile("\\<STYLE.*?</STYLE>",
Matcher matcher = p.matcher(text);
String tmp = matcher.replaceAll("");
return tmp;
public static boolean isLatinWord(String word) {
for (int i = 0; i < word.length(); i++) {
int asciiCode = (int) word.charAt(i);
if (asciiCode > 128)
return false;
return true;
static public void main(String[] args) {
System.out.println(isLatinWord("Performing Arts Center (SPAC)"));
System.out.println(isLatinWord("“Jazz Age�"));
System.out.println(isLatinWord("é ñçøåó"));
System.out.println(isLatinWord("ùìîä à øöé"));
.println(isLatinWord("陳港�, 陈港�"));
.println(convertToASCII("Irvine Bay Hotel & Golf Club on Sunday, May 01 during Jazz on the Beach, Tobago Jazz Experience alongside The Jazz Singer"));
.println(convertToASCII("This year’s event, held again at the wonderful Saratoga Performing Arts Center (SPAC)"));
.println(convertToASCII("and the great saxophone playing of Sam Rogers Rush Hour Blues 2010   . "));
.println(convertToASCII("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Ron Carter is among the most original, prolific "));
.println(convertToASCII(" .             Ron Carter is among the most original, prolific. "));
// TODO deal with
.println(convertToASCII("By the mid 1920’s, during the period referred to as the “Jazz Age�, jazz music was heard in most major cities from the East Coast"));