// blob: 37d83aaa4f61a9e4eca2f07e311c0855115f4ab1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.textsimilarity;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.similarity.apps.utils.Pair;
import opennlp.tools.stemmer.PorterStemmer;

import org.apache.commons.lang.StringUtils;
/**
 * Collection of static helpers for tokenizing, splitting, cleaning,
 * fingerprinting and summarizing text.
 */
public class TextProcessor {
// Shared logger for this class.
private static final Logger LOG = Logger
.getLogger("opennlp.tools.textsimilarity.TextProcessor");
// Lower-case abbreviations consulted by splitToSentences: a candidate sentence
// whose lower-cased text ends with one of these is not closed at that point.
// NOTE(review): "may" is the only entry without a trailing period - confirm
// that this is intended.
static final String[] abbrevs = { "mr.", "mrs.", "sen.", "rep.", "gov.",
"miss.", "dr.", "oct.", "nov.", "jan.", "feb.", "mar.", "apr.", "may",
"jun.", "jul.", "aug.", "sept." };
/**
 * Compares every pair of segments, collects the word sequences they share
 * (at least 4 tokens long) and prints each shared sequence together with the
 * number of segments it was found in.
 * <p>
 * Despite the name, the {@code segments} list itself is not modified; the
 * result is only written to standard output.
 *
 * @param segments
 *          texts to compare pairwise
 */
public static void removeCommonPhrases(ArrayList<String> segments) {
  // Pre-compute tokens and token positions once per segment.
  List<Pair<List<String>, Map<String, HashSet<Integer>>>> indexed =
      new ArrayList<Pair<List<String>, Map<String, HashSet<Integer>>>>();
  for (String segment : segments) {
    indexed.add(buildTokenPositions(segment));
  }

  // shared sequence -> ids of the segments that contain it
  Map<String, HashSet<Integer>> docsBySegment = new HashMap<String, HashSet<Integer>>();
  for (int a = 0; a < indexed.size(); a++) {
    for (int b = a + 1; b < indexed.size(); b++) {
      for (String shared : extractCommonSegments(indexed.get(a), indexed.get(b), 4)) {
        HashSet<Integer> docIds = docsBySegment.get(shared);
        if (docIds == null) {
          docIds = new HashSet<Integer>();
          docsBySegment.put(shared, docIds);
        }
        // both documents of the pair contain this sequence
        docIds.add(a);
        docIds.add(b);
      }
    }
  }

  System.out.println(segments.size() + " docs");
  for (Map.Entry<String, HashSet<Integer>> entry : docsBySegment.entrySet()) {
    System.out.println(entry.getKey() + ":" + entry.getValue().size());
  }
}
/**
 * Tokenizes both strings on whitespace and returns the word sequences they
 * have in common.
 *
 * @param s1
 *          first text
 * @param s2
 *          second text
 * @param segSize
 *          minimum number of tokens a shared sequence must contain to be
 *          reported
 * @return the shared sequences, tokens joined by single spaces
 */
public static HashSet<String> extractCommonSegments(String s1, String s2,
    Integer segSize) {
  return extractCommonSegments(buildTokenPositions(s1),
      buildTokenPositions(s2), segSize);
}
/**
 * Walks the tokens of document A and grows a run for as long as each token
 * also occurs in document B at a position directly following some occurrence
 * of the previous token. A run is reported once it holds at least
 * {@code segSize} tokens; a longer run replaces its own shorter prefix.
 *
 * @param objA
 *          tokens (first) and token positions (second) of document A; only
 *          the token list is read
 * @param objB
 *          tokens and token positions of document B; only the position map
 *          is read
 * @param segSize
 *          minimum number of tokens in a reported run
 * @return the shared runs, tokens joined by single spaces
 */
private static HashSet<String> extractCommonSegments(
    Pair<List<String>, Map<String, HashSet<Integer>>> objA,
    Pair<List<String>, Map<String, HashSet<Integer>>> objB, Integer segSize) {
  HashSet<String> shared = new HashSet<String>();
  List<String> tokensOfA = objA.getFirst();
  Map<String, HashSet<Integer>> positionsInB = objB.getSecond();

  // Positions in B of the previous token of the current run; null = no run.
  HashSet<Integer> previous = null;
  // Current run; every token is preceded by one space, so the number of
  // spaces equals the number of tokens in the run.
  StringBuffer run = new StringBuffer();

  for (String token : tokensOfA) {
    HashSet<Integer> current = positionsInB.get(token);

    if (current == null) {
      // token unknown to B: abandon the run
      run.setLength(0);
      previous = null;
      continue;
    }
    if (previous == null) {
      // first token of a new run
      previous = current;
      run.append(" ");
      run.append(token);
      continue;
    }
    if (!hasNextPosition(previous, current)) {
      // token is in B but not adjacent to the previous one: abandon the run
      // (the current token does not start a new run)
      run.setLength(0);
      previous = null;
      continue;
    }

    // The run grows: drop the shorter version before recording the longer one.
    shared.remove(run.toString().trim());
    run.append(" ");
    run.append(token);
    if (StringUtils.countMatches(run.toString(), " ") >= segSize) {
      shared.add(run.toString().trim());
    }
    previous = current;
  }
  return shared;
}
/**
 * Tells whether any position in {@code positionsA} is immediately followed
 * by a position contained in {@code positionsB}.
 *
 * @param positionsA
 *          positions of the earlier token
 * @param positionsB
 *          positions of the later token
 * @return {@code true} if some {@code p} in A has {@code p + 1} in B
 */
private static boolean hasNextPosition(HashSet<Integer> positionsA,
    HashSet<Integer> positionsB) {
  for (Integer pos : positionsA) {
    if (positionsB.contains(pos + 1)) {
      return true;
    }
  }
  return false;
}
/**
 * Splits {@code s} with {@code StringUtils.split} and records, for every
 * distinct token, the set of indices at which it occurs.
 *
 * @param s
 *          text to tokenize
 * @return pair of (token list in order, token -> indices of its occurrences)
 */
public static Pair<List<String>, Map<String, HashSet<Integer>>> buildTokenPositions(
    String s) {
  ArrayList<String> tokens = new ArrayList<String>(
      Arrays.asList(StringUtils.split(s)));
  Map<String, HashSet<Integer>> positionsByToken = new HashMap<String, HashSet<Integer>>();
  int index = 0;
  for (String token : tokens) {
    HashSet<Integer> positions = positionsByToken.get(token);
    if (positions == null) {
      positions = new HashSet<Integer>();
      positionsByToken.put(token, positions);
    }
    positions.add(index++);
  }
  return new Pair<List<String>, Map<String, HashSet<Integer>>>(tokens,
      positionsByToken);
}
/**
 * Tells whether a token contains no letter and no digit.
 *
 * @param token
 *          token to inspect
 * @return {@code true} if no character is a letter or digit; an empty token
 *         therefore yields {@code true}
 */
public static boolean isStringAllPunc(String token) {
  for (char c : token.toCharArray()) {
    if (Character.isLetterOrDigit(c)) {
      return false;
    }
  }
  return true;
}
/**
 * Splits input text into sentences.
 * <p>
 * A boundary is a {@code ?}, {@code !} or {@code .}, optionally followed by a
 * double quote, then exactly one whitespace character (or a literal
 * {@code +}), then an upper-case ASCII letter, a digit or {@code i}. A
 * boundary is ignored when the text collected so far ends with one of the
 * known abbreviations, in which case the sentence continues.
 *
 * @param text
 *          Input text; {@code null} or blank text yields an empty list
 * @return List of sentences, each trimmed
 */
public static ArrayList<String> splitToSentences(String text) {
  ArrayList<String> sentences = new ArrayList<String>();
  if (text == null || text.trim().length() == 0) {
    return sentences;
  }

  // NOTE(review): "[\\s+]" is a character class (one whitespace char or '+'),
  // not "one or more whitespace"; kept as-is to preserve where text is split.
  Pattern boundary = Pattern.compile("[\\?!\\.]\"?[\\s+][A-Z0-9i]",
      Pattern.MULTILINE);
  Matcher m = boundary.matcher(text);

  int idx = 0;
  // Text collected for the sentence currently being built; it spans several
  // matches when a boundary turned out to follow an abbreviation.
  StringBuilder cand = new StringBuilder();
  while (m.find()) {
    // the last matched char is the first char of the next sentence
    int cut = m.end() - 1;
    cand.append(' ').append(text.substring(idx, cut).trim());

    String lowered = cand.toString().toLowerCase();
    boolean hasAbbrev = false;
    for (int i = 0; i < abbrevs.length; i++) {
      if (lowered.endsWith(abbrevs[i])) {
        hasAbbrev = true;
        break;
      }
    }
    if (!hasAbbrev) {
      sentences.add(cand.toString().trim());
      cand.setLength(0);
    }
    idx = cut;
  }

  // Whatever follows the last boundary is the final sentence. Any pending
  // abbreviation candidate is prepended so it is not lost, and no sentinel
  // text is appended to the input, so nothing artificial can leak out.
  cand.append(' ').append(text.substring(idx).trim());
  String tail = cand.toString().trim();
  if (tail.length() > 0) {
    sentences.add(tail);
  }
  return sentences;
}
/**
 * Decides whether the character at {@code idx} is punctuation that may stay
 * inside a token.
 * <p>
 * The method currently always returns {@code false} because of the
 * unconditional short-circuit at the top. NOTE(review): this looks like a
 * deliberate switch-off of the rule below - confirm before removing either
 * part.
 *
 * @param chars
 *          characters of the text being tokenized
 * @param idx
 *          index of the character to inspect
 * @return always {@code false} at present
 */
private static boolean isSafePunc(char[] chars, int idx) {
  if (true) {
    return false;
  }

  // --- disabled rule: candidate char strictly between two letters/digits ---
  int c = chars[idx];
  boolean candidate = c == 39 || c == 45 || c == 8211 || c == 8212
      || c == 145 || c == 146 || c == 8216 || c == 8217;
  if (!candidate) {
    return false;
  }
  // a candidate at either end of the array has no neighbour on one side
  if (idx == 0 || idx == chars.length - 1) {
    return false;
  }
  return Character.isLetterOrDigit(chars[idx + 1])
      && Character.isLetterOrDigit(chars[idx - 1]);
}
/**
 * Tokenizes the sentence without punctuation and returns the tokens separated
 * by single spaces.
 * <p>
 * The result is produced from the list rendering of the tokens, so it keeps
 * one leading and one trailing space (for example {@code " a b c "}).
 *
 * @param sentence
 *          text to strip
 * @return the letter/digit tokens of the sentence, space separated
 */
public static String removePunctuation(String sentence) {
  List<String> toks = fastTokenize(sentence, false);
  // "[a, b]" -> " a  b " after the char replacements; the last step collapses
  // the doubled spaces left where ", " used to be (it was a no-op before).
  return toks.toString().replace('[', ' ').replace(']', ' ')
      .replace(',', ' ').replace("  ", " ");
}
/**
 * Splits text into runs of letters/digits. Every other character ends the
 * current token; non-whitespace delimiters can additionally be reported as a
 * {@code <punc>} placeholder token.
 *
 * @param txt
 *          text to tokenize; {@code null} or empty yields an empty list
 * @param retainPunc
 *          if {@code true}, one {@code "<punc>"} token is emitted per
 *          non-whitespace delimiter character
 * @return tokens in order of appearance
 */
public static ArrayList<String> fastTokenize(String txt, boolean retainPunc) {
  ArrayList<String> tokens = new ArrayList<String>();
  if (StringUtils.isEmpty(txt)) {
    return tokens;
  }

  StringBuffer current = new StringBuffer();
  char[] chars = txt.toCharArray();
  for (int i = 0; i < chars.length; i++) {
    char c = chars[i];
    if (Character.isLetterOrDigit(c) || isSafePunc(chars, i)) {
      current.append(c);
      continue;
    }

    // c is a delimiter: flush the token built so far
    if (current.length() > 0) {
      tokens.add(current.toString());
      current.setLength(0);
    }
    if (retainPunc && !Character.isWhitespace(c)) {
      tokens.add("<punc>");
    }
  }

  // flush the trailing token, if any
  if (current.length() > 0) {
    tokens.add(current.toString());
  }
  return tokens;
}
/**
 * Joins tokens with single spaces and trims the result.
 *
 * @param tokens
 *          tokens to join
 * @return the joined, trimmed string; empty for an empty list
 */
public static String convertTokensToString(ArrayList<String> tokens) {
  StringBuilder joined = new StringBuilder();
  for (int i = 0; i < tokens.size(); i++) {
    joined.append(tokens.get(i)).append(' ');
  }
  return joined.toString().trim();
}
/**
 * Array convenience overload: copies the tokens into a list and delegates to
 * the list-based {@code getAllBigrams}.
 *
 * @param tokens
 *          tokens in document order
 * @param retainPunc
 *          whether bigrams containing {@code <punc>} are kept
 * @return lower-cased bigram -> number of occurrences
 */
public static Hashtable<String, Integer> getAllBigrams(String[] tokens,
    boolean retainPunc) {
  return getAllBigrams(new ArrayList<String>(Arrays.asList(tokens)),
      retainPunc);
}
/**
 * Counts every pair of adjacent tokens, lower-cased and joined by one space.
 *
 * @param tokens
 *          tokens in document order
 * @param retainPunc
 *          if {@code false}, bigrams containing {@code <punc>} are skipped
 * @return lower-cased bigram -> number of occurrences
 */
public static Hashtable<String, Integer> getAllBigrams(
    ArrayList<String> tokens, boolean retainPunc) {
  Hashtable<String, Integer> counts = new Hashtable<String, Integer>();
  for (int i = 1; i < tokens.size(); i++) {
    String bigram = (tokens.get(i - 1) + " " + tokens.get(i)).toLowerCase();
    if (!retainPunc && bigram.indexOf("<punc>") != -1) {
      continue;
    }
    Integer seen = counts.get(bigram);
    counts.put(bigram, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
  }
  return counts;
}
/**
 * Counts every pair of adjacent tokens, lower-cased and joined by one space,
 * using float counts. Stop-word filtering is not performed at present (the
 * stop-list lookup is disabled), so this differs from {@code getAllBigrams}
 * only in the value type and in swallowing exceptions.
 *
 * @param tokens
 *          tokens in document order
 * @param retainPunc
 *          if {@code false}, bigrams containing {@code <punc>} are skipped
 * @return lower-cased bigram -> number of occurrences; if an exception occurs
 *         it is logged and the counts gathered so far are returned
 */
public static Hashtable<String, Float> getAllBigramsStopWord(
    ArrayList<String> tokens, boolean retainPunc) {
  Hashtable<String, Float> counts = new Hashtable<String, Float>();
  try {
    for (int i = 1; i < tokens.size(); i++) {
      String first = tokens.get(i - 1).toLowerCase();
      String second = tokens.get(i).toLowerCase();
      String bigram = (first + " " + second).toLowerCase();
      if (!retainPunc && bigram.indexOf("<punc>") != -1) {
        continue;
      }
      Float seen = counts.get(bigram);
      counts.put(bigram, seen == null ? 1f : seen + 1);
    }
  } catch (Exception e) {
    LOG.severe("Problem getting stoplist");
  }
  return counts;
}
/**
 * Tokenizes the text keeping {@code <punc>} placeholders and stems every
 * other token in place.
 *
 * @param txt
 *          text to process
 * @return stemmed tokens with {@code <punc>} placeholders left untouched
 */
public static ArrayList<String> tokenizeAndStemWithPunctuation(String txt) {
  ArrayList<String> tokens = fastTokenize(txt, true);
  for (int i = 0; i < tokens.size(); i++) {
    String token = tokens.get(i);
    if ("<punc>".equals(token)) {
      continue;
    }
    tokens.set(i, TextProcessor.stemTerm(token));
  }
  return tokens;
}
/**
 * Removes every leading character that is not a letter or digit.
 *
 * @param text
 *          text to trim
 * @return the text from its first letter/digit onwards; {@code ""} if there
 *         is none, or if a runtime exception (for example on {@code null}
 *         input) occurs, in which case the exception is logged
 */
public static String trimPunctuationFromStart(String text) {
  try {
    int length = text.length();
    int start = 0;
    while (start < length && isPunctuation(text.charAt(start))) {
      start++;
    }
    return start == length ? "" : text.substring(start);
  } catch (RuntimeException e) {
    LOG.severe("RuntimeException " + e);
    e.printStackTrace();
    return "";
  }
}
/**
 * Removes every leading and trailing character that is not a letter or
 * digit.
 *
 * @param text
 *          text to trim
 * @return the text between its first and last letter/digit (inclusive);
 *         {@code ""} if there is none, or if a runtime exception (for
 *         example on {@code null} input) occurs, in which case the exception
 *         is logged
 */
public static String trimPunctuation(String text) {
  try {
    int limit = text.length();
    int start = 0;
    while (start < limit && isPunctuation(text.charAt(start))) {
      start++;
    }
    if (start == limit) {
      return "";
    }
    // a letter/digit exists at 'start', so this scan stops at or after it
    int end = limit - 1;
    while (end >= 0 && isPunctuation(text.charAt(end))) {
      end--;
    }
    return text.substring(start, end + 1);
  } catch (RuntimeException e) {
    LOG.severe("RuntimeException " + e);
    return "";
  }
}
/**
 * Treats every character that is neither a letter nor a digit as
 * punctuation; this includes whitespace.
 *
 * @param c
 *          character to test
 * @return {@code true} unless {@code c} is a letter or digit
 */
public static boolean isPunctuation(char c) {
  if (Character.isLetterOrDigit(c)) {
    return false;
  }
  return true;
}
/**
 * Trims and lower-cases a token, strips at most one trailing and one leading
 * punctuation character, and stems what is left with the Porter stemmer.
 *
 * @param token
 *          raw token
 * @return {@code ""} for a blank token; the token itself (unstemmed) when it
 *         is reduced to a single punctuation character while stripping;
 *         otherwise the stem of the cleaned token
 */
public static String stemAndClean(String token) {
  String cleaned = token.trim().toLowerCase();
  if (cleaned.length() == 0) {
    return "";
  }

  // one trailing punctuation character
  if (isPunctuation(cleaned.substring(cleaned.length() - 1))) {
    if (cleaned.length() == 1) {
      return cleaned;
    }
    cleaned = cleaned.substring(0, cleaned.length() - 1);
  }

  // one leading punctuation character
  if (isPunctuation(cleaned)) {
    if (cleaned.length() == 1) {
      return cleaned;
    }
    cleaned = cleaned.substring(1);
  }

  return new PorterStemmer().stem(cleaned).toString();
}
/**
 * Trims a token and strips at most one trailing and one leading punctuation
 * character. Case is left unchanged.
 *
 * @param token
 *          raw token
 * @return {@code ""} for a blank token; the single punctuation character
 *         when the token is reduced to one while stripping; otherwise the
 *         cleaned token
 */
public static String cleanToken(String token) {
  String cleaned = token.trim();
  if (cleaned.length() == 0) {
    return "";
  }

  // one trailing punctuation character
  if (isPunctuation(cleaned.substring(cleaned.length() - 1))) {
    if (cleaned.length() == 1) {
      return cleaned;
    }
    cleaned = cleaned.substring(0, cleaned.length() - 1);
  }

  // one leading punctuation character
  if (isPunctuation(cleaned)) {
    if (cleaned.length() == 1) {
      return cleaned;
    }
    cleaned = cleaned.substring(1);
  }

  return cleaned;
}
/**
 * Tells whether the string consists only of digits matched by the regex
 * class {@code \d}.
 *
 * @param str
 *          string to test
 * @return {@code true} if every character is a digit; the empty string also
 *         yields {@code true}
 */
public static boolean isAllNumbers(String str) {
  return Pattern.matches("^\\d*$", str);
}
/**
 * Tells whether the FIRST character of the string is punctuation, i.e.
 * neither a digit, a word character nor whitespace. Only the first character
 * is inspected.
 *
 * @param str
 *          string to test
 * @return {@code false} for an empty string, otherwise whether the first
 *         character matches {@code [^\d\w\s]}
 */
private static boolean isPunctuation(String str) {
  if (str.length() < 1) {
    return false;
  }
  return Pattern.matches("[^\\d\\w\\s]", str.substring(0, 1));
}
/**
 * Strips a possessive suffix via {@code stripToken} and stems the remainder
 * with a new Porter stemmer.
 *
 * @param term
 *          term to stem
 * @return the stemmed term
 */
public static String stemTerm(String term) {
  String stripped = stripToken(term);
  return new PorterStemmer().stem(stripped).toString();
}
/**
 * Computes a fingerprint of the text: the Base64 encoding of the "SHA"
 * (SHA-1) digest of its UTF-8 bytes.
 * <p>
 * Previously the encoding step was commented out and the method returned
 * {@code null} for every non-empty input.
 *
 * @param s
 *          text to fingerprint
 * @return {@code ""} for empty input, otherwise the Base64 encoded digest
 * @throws IllegalStateException
 *           if the runtime provides no "SHA" message digest
 */
public static String generateFingerPrint(String s) {
  if (s.length() == 0) {
    return "";
  }
  MessageDigest md;
  try {
    md = MessageDigest.getInstance("SHA");
  } catch (NoSuchAlgorithmException e) {
    // Fail with the cause attached instead of continuing into a
    // NullPointerException on the missing digest.
    throw new IllegalStateException("SHA message digest is not available", e);
  }
  byte[] raw = md.digest(s.getBytes(StandardCharsets.UTF_8));
  return Base64.getEncoder().encodeToString(raw);
}
/**
 * Returns the fingerprint of the text with every {@code ?} and {@code /}
 * replaced by {@code +}.
 *
 * @param s
 *          text to fingerprint
 * @return the rewritten fingerprint, or {@code null} when no fingerprint was
 *         produced for {@code s}
 */
public static String generateUrlSafeFingerPrint(String s) {
  String signature = TextProcessor.generateFingerPrint(s);
  if (signature == null) {
    // propagate "no fingerprint" instead of failing with a NullPointerException
    return null;
  }
  return signature.replaceAll("[?/]", "+");
}
/**
 * Computes a fingerprint of the stemmed-token histogram of the text: tokens
 * are lower-cased and stemmed, their frequencies are counted, the
 * {@code stem + count} pairs are concatenated in sorted stem order, and the
 * result is hashed with "SHA" (SHA-1) and Base64 encoded.
 * <p>
 * Sorting the stems makes the fingerprint depend only on the histogram, not
 * on the order in which tokens were encountered. Previously the encoding
 * step was commented out and the method returned {@code null} for every
 * non-empty histogram.
 *
 * @param s
 *          text to fingerprint
 * @return {@code ""} if the text yields no countable token, otherwise the
 *         Base64 encoded digest
 * @throws Exception
 *           if the runtime provides no "SHA" message digest
 */
public static String generateFingerPrintForHistogram(String s)
    throws Exception {
  Map<String, Integer> histogram = new TreeMap<String, Integer>();
  for (String token : TextProcessor.fastTokenize(s, true)) {
    String tokenLower = token.toLowerCase();
    // placeholder tokens carry no content; compare by value, not identity
    if ("<punc>".equals(tokenLower) || "close_a".equals(tokenLower)
        || "open_a".equals(tokenLower)) {
      continue;
    }
    String stemmedToken = TextProcessor.stemTerm(tokenLower);
    Integer seen = histogram.get(stemmedToken);
    histogram.put(stemmedToken, seen == null ? 1 : seen + 1);
  }

  // serialize the histogram
  StringBuilder hashString = new StringBuilder();
  for (Map.Entry<String, Integer> entry : histogram.entrySet()) {
    hashString.append(entry.getKey()).append(entry.getValue());
  }
  if (hashString.length() == 0) {
    return "";
  }

  MessageDigest md;
  try {
    md = MessageDigest.getInstance("SHA");
  } catch (NoSuchAlgorithmException e) {
    LOG.severe("NoSuchAlgorithmException " + e);
    throw new Exception(e.getMessage(), e);
  }
  byte[] raw = md.digest(hashString.toString().getBytes(StandardCharsets.UTF_8));
  return Base64.getEncoder().encodeToString(raw);
}
/**
 * Removes a trailing possessive {@code 's} from a token.
 * <p>
 * The apostrophe may be the ASCII one, the typographic right single
 * quotation mark (U+2019), or the Unicode replacement character (U+FFFD)
 * that a mis-decoded typographic apostrophe turns into.
 *
 * @param token
 *          token to strip
 * @return the token without its possessive suffix, or the token unchanged
 */
public static String stripToken(String token) {
  if (token.endsWith("'s") || token.endsWith("\u2019s")
      || token.endsWith("\uFFFDs")) {
    return token.substring(0, token.length() - 2);
  }
  return token;
}
/**
 * Counts how often each token occurs, ignoring case.
 *
 * @param tokens
 *          tokens to count
 * @return lower-cased token -> number of occurrences
 */
public static HashMap<String, Integer> getUniqueTokenIndex(List<String> tokens) {
  HashMap<String, Integer> counts = new HashMap<String, Integer>();
  for (String token : tokens) {
    String key = token.toLowerCase();
    Integer seen = counts.get(key);
    counts.put(key, seen == null ? 1 : seen + 1);
  }
  return counts;
}
/**
 * Builds a summary of {@code txt} from those of its sentences that pass a
 * set of heuristics, then cleans up the start of the result.
 * <p>
 * Whenever the heuristics leave fewer than 5 characters, or any exception is
 * thrown along the way, the (pipe-stripped) input text is used instead.
 *
 * @param txt
 *          text to summarize; also the fallback result
 * @param title
 *          title handed to {@code trimSentence} for every sentence
 * @param numChars
 *          sentences stop being added once the collected text is longer than
 *          this; also the truncation limit when {@code truncateInSentence}
 *          is set
 * @param truncateInSentence
 *          if {@code true}, the summary is cut at a whitespace position
 *          within {@code numChars} and padded to end in three periods;
 *          otherwise a trailing ".." is removed
 * @return the trimmed summary
 */
public static String generateSummary(String txt, String title, int numChars,
boolean truncateInSentence) {
String finalSummary = "";
try {
// Markers looked for near the start of the summary (see further down):
// everything up to and including the chosen marker is cut off.
// NOTE(review): the last entry is a single space, which matches almost any
// text - confirm it is the intended value.
String[] puncChars = { ":", "--", "PM", "MST", "EST", "CST", "PST",
"GMT", "AM", " " };
// drop pipe separators
txt = txt.replace(" | ", " ");
txt = txt.replace(" |", " ");
ArrayList<String> sentences = TextProcessor.splitToSentences(txt);
// System.out.println("Sentences are:");
StringBuffer sum = new StringBuffer();
// cnt: 1-based index of the current sentence; lCnt: index of the last
// sentence that was added (0 = none yet)
int cnt = 0;
int lCnt = 0;
for (String s : sentences) {
cnt++;
// System.out.println(s + "\n");
s = trimSentence(s, title);
// see if sentence has a time in it
// boolean containsTime = s.co("[0-9]");
// keep only sentences longer than 60 chars that start with an upper-case
// letter and contain none of "By", "Page", ">>"
if (s.length() > 60 && !s.contains("By") && !s.contains("Page")
&& !s.contains(">>") && Character.isUpperCase(s.charAt(0))) {
// System.out.println("cleaned: " + s + "\n");
// sentences were skipped since the last added one: mark the gap so that
// the text ends in "..." before the next sentence; otherwise just a space
if (Math.abs(cnt - lCnt) != 1 && lCnt != 0) {
if (sum.toString().endsWith(".")) {
sum.append("..");
} else {
sum.append("...");
}
} else {
sum.append(" ");
}
sum.append(s.trim());
lCnt = cnt;
}
// enough text collected
if (sum.length() > numChars) {
break;
}
}
finalSummary = sum.toString().trim();
if (truncateInSentence) {
finalSummary = truncateTextOnSpace(finalSummary, numChars);
// pad a non-empty summary so that it ends in three periods
int numPeriods = countTrailingPeriods(finalSummary);
if (numPeriods < 3 && finalSummary.length() > 0) {
for (int i = 0; i < 3 - numPeriods; i++) {
finalSummary += ".";
}
}
} else {
// trim a trailing ".."
if (finalSummary.endsWith("..")) {
finalSummary = finalSummary.substring(0, finalSummary.length() - 2);
}
}
// check to see if we have anything, if not, return the fullcontent
if (finalSummary.trim().length() < 5) {
finalSummary = txt;
}
// look for the markers within (at most) the first 45 chars; highestIdx ends
// up as the offset just past the chosen marker
// NOTE(review): when finalSummary is empty sIdx is -1 and the substring call
// below throws, which lands in the catch block and returns txt
int highestIdx = -1;
int sIdx = Math.min(finalSummary.length() - 1, 45);
for (String p : puncChars) {
int idx = finalSummary.trim().substring(0, sIdx).lastIndexOf(p);
if (idx > highestIdx && idx < 45) {
highestIdx = idx + p.length();
}
}
if (highestIdx > -1) {
finalSummary = finalSummary.substring(highestIdx);
}
// an unmatched ")" within the first 15 chars: drop everything through it
int closeParenIdx = finalSummary.indexOf(")");
int openParenIdx = finalSummary.indexOf("(");
// if(closeParenIdx < )
if (closeParenIdx != -1 && closeParenIdx < 15
&& (openParenIdx == -1 || openParenIdx > closeParenIdx)) {
finalSummary = finalSummary.substring(closeParenIdx + 1).trim();
}
finalSummary = trimPunctuationFromStart(finalSummary);
// check to see if we have anything, if not, return the fullcontent
if (finalSummary.trim().length() < 5) {
finalSummary = txt;
}
} catch (Exception e) {
// any failure above falls back to the full text
LOG.severe("Problem forming summary for: " + txt);
LOG.severe("Using full text for the summary" + e);
finalSummary = txt;
}
return finalSummary.trim();
}
/**
 * Shortens text to at most {@code numChars} characters, cutting at the last
 * whitespace character inside that limit so that no word is split.
 *
 * @param txt
 *          text to shorten
 * @param numChars
 *          maximum length to aim for
 * @return the text unchanged if it already fits; otherwise the part before
 *         the last whitespace within the first {@code numChars} characters.
 *         If that prefix contains no whitespace the full text is returned
 *         unshortened.
 */
public static String truncateTextOnSpace(String txt, int numChars) {
  if (txt.length() <= numChars) {
    return txt;
  }
  String head = txt.substring(0, numChars);
  int cut = head.length() - 1;
  while (cut >= 0 && !Character.isWhitespace(head.charAt(cut))) {
    cut--;
  }
  return cut == -1 ? txt : head.substring(0, cut);
}
/**
 * Counts the {@code '.'} characters at the very end of the text.
 *
 * @param txt
 *          text to inspect
 * @return number of consecutive trailing periods; 0 for empty text
 */
public static int countTrailingPeriods(String txt) {
  int last = txt.length() - 1;
  int i = last;
  while (i >= 0 && txt.charAt(i) == '.') {
    i--;
  }
  return last - i;
}
/**
 * Strips boilerplate from the front of a sentence. The steps run in this
 * order, each on the result of the previous one:
 * <ol>
 * <li>scanning from the end, the first run of more than three consecutive
 * upper-case characters and everything before it is removed;</li>
 * <li>leading characters up to the first upper-case character are removed
 * (at least the last character is always kept);</li>
 * <li>if the trimmed title occurs, the text is cut to start at the title's
 * last character;</li>
 * <li>for each of the separators " (U+FFFD) ", " (en dash) ", " (em dash) "
 * and " - ", in that order, text up to and including its first occurrence is
 * removed;</li>
 * <li>text up to and including "del.icio.us" is removed.</li>
 * </ol>
 * The separator list used to contain only a mis-encoded dash (U+FFFD) and the
 * ASCII hyphen, so real en/em dashes never matched.
 *
 * @param txt
 *          sentence to clean
 * @param title
 *          document title; must not be {@code null}
 * @return the cleaned sentence
 */
public static String trimSentence(String txt, String title) {
  // 1. cut after the right-most run of >3 upper-case characters
  int capRun = 0;
  int runEnd = -1; // index of the right-most character of the current run
  for (int i = txt.length() - 1; i >= 0; i--) {
    if (Character.isUpperCase(txt.charAt(i))) {
      if (capRun == 0) {
        runEnd = i;
      }
      capRun++;
      if (capRun > 3) {
        txt = txt.substring(runEnd + 1);
        break;
      }
    } else {
      capRun = 0;
      runEnd = -1;
    }
  }

  // 2. skip to the first upper-case character (never past the last char)
  int start = 0;
  while (start < txt.length() - 1 && !Character.isUpperCase(txt.charAt(start))) {
    start++;
  }
  txt = txt.substring(start);

  // 3. scrub the title
  // NOTE(review): the "- 1" keeps the title's last character - confirm that
  // this is intended rather than an off-by-one.
  String trimmedTitle = title.trim();
  if (trimmedTitle.length() > 0) {
    int at = txt.indexOf(trimmedTitle);
    if (at != -1) {
      txt = txt.substring(at + trimmedTitle.length() - 1);
    }
  }

  // 4. scrub everything before a dash separator
  String[] separators = { " \uFFFD ", " \u2013 ", " \u2014 ", " - " };
  for (String separator : separators) {
    int at = txt.indexOf(separator);
    if (at != -1) {
      txt = txt.substring(at + separator.length());
    }
  }

  // 5. scrub a social-bookmark prefix
  int bookmark = txt.indexOf("del.icio.us");
  if (bookmark != -1) {
    txt = txt.substring(bookmark + "del.icio.us".length());
  }
  return txt;
}
/**
 * Removes stop-listed phrases from the first and last quarter of the text:
 * in the first quarter everything up to and including a phrase is dropped,
 * in the last quarter everything from a phrase onwards is dropped. The
 * middle half is never touched.
 * <p>
 * The stop-list lookup that used to provide the phrases (the "EXTRACTOR"
 * list) is disabled, so the phrase set is currently empty and the method
 * returns the trimmed input. Previously the set was left {@code null} and
 * every call failed with a {@code NullPointerException}.
 *
 * @param txt
 *          text to clean
 * @return the cleaned, trimmed text
 */
public static String removeStopListedTermsAndPhrases(String txt) {
  // No stop-list source is wired in at the moment; an empty set keeps the
  // method usable and the removal logic below ready for when one is.
  HashSet<String> stopPhrases = new HashSet<String>();

  // segment into first quarter, middle half and last quarter
  int quarter = txt.length() / 4;
  int endIdx = txt.length() - quarter;
  String startPart = txt.substring(0, quarter);
  String middlePart = txt.substring(quarter, endIdx);
  String endPart = txt.substring(endIdx);

  for (String phrase : stopPhrases) {
    int idx = startPart.indexOf(phrase);
    if (idx != -1) {
      startPart = startPart.substring(idx + phrase.length());
    }
    idx = endPart.indexOf(phrase);
    if (idx != -1) {
      endPart = endPart.substring(0, idx);
    }
  }

  return (startPart + middlePart + endPart).trim();
}
/**
 * Collects the distinct space-separated tokens that start with
 * {@code http://}, in order of first appearance.
 *
 * @param txt
 *          text to scan
 * @return distinct URLs found; other schemes (such as https) are not
 *         collected
 */
public static List<String> extractUrlsFromText(String txt) {
  List<String> urls = new ArrayList<String>();
  for (String token : txt.split(" ")) {
    if (token.startsWith("http://") && !urls.contains(token)) {
      urls.add(token);
    }
  }
  return urls;
}
/**
 * Finds the tokens (split on single spaces, compared case-insensitively)
 * that occur in every segment.
 *
 * @param segments
 *          texts to compare
 * @return lower-cased tokens present in all segments; empty when fewer than
 *         two segments are given
 */
public static List<String> findCommonTokens(List<String> segments) {
  List<String> commonTokens = new ArrayList<String>();
  if (segments.size() <= 1) {
    return commonTokens;
  }

  // Each segment contributes every distinct token exactly once ...
  List<String> distinctPerSegment = new ArrayList<String>();
  for (String segment : segments) {
    List<String> tokens = Arrays.asList(segment.split(" "));
    distinctPerSegment.addAll(TextProcessor.getUniqueTokenIndex(tokens).keySet());
  }

  // ... so a token counted once per segment occurs in all of them.
  int segmentCount = segments.size();
  HashMap<String, Integer> segmentsPerToken = TextProcessor
      .getUniqueTokenIndex(distinctPerSegment);
  for (Map.Entry<String, Integer> entry : segmentsPerToken.entrySet()) {
    if (entry.getValue().intValue() == segmentCount) {
      commonTokens.add(entry.getKey());
    }
  }
  return commonTokens;
}
/**
 * Counts the whitespace-separated tokens in a string.
 * <p>
 * Runs of whitespace (including tabs and line breaks) count as a single
 * separator; previously every extra space produced an empty token that was
 * counted as well.
 *
 * @param txt
 *          text to count tokens in; may be {@code null}
 * @return number of tokens; 0 for {@code null} or blank text
 */
public static int numTokensInString(String txt) {
  if (txt == null) {
    return 0;
  }
  String trimmed = txt.trim();
  if (trimmed.length() == 0) {
    return 0;
  }
  return trimmed.split("\\s+").length;
}
/**
 * Normalizes whitespace and break/paragraph markup in an HTML fragment.
 * {@code null} and empty input are returned unchanged.
 *
 * @param str
 *          HTML fragment
 * @return the normalized fragment
 */
public static String defragmentText(String str) {
  if (!StringUtils.isNotEmpty(str)) {
    return str;
  }
  String text = str;
  // non-breaking space entities become plain spaces
  text = text.replace("&nbsp;", " ");
  // normalize the break tag
  text = text.replace("<br />", "<br/>");
  // collapse every run of whitespace into one space
  text = text.replaceAll("\\s+", " ");
  // remove empty paragraphs
  text = text.replace("<p> </p>", "");
  text = text.replace("<p></p>", "");
  text = text.replace("<p/>", "");
  // a strong tag wrapping nothing but a break is reduced to the break
  text = text.replace("<strong><br/></strong>", "<br/>");
  // every run of one or more breaks becomes exactly two breaks
  text = text.replaceAll("(<br/>)+", "<br/><br/>");
  // a paragraph start directly followed by a break loses that break
  text = text.replace("<p><br/>", "<p>");
  return text;
}
}