// opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java (opennlp-sandbox)

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.textsimilarity;

 import java.io.UnsupportedEncodingException;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Base64;
 import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 import java.util.logging.Logger;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import opennlp.tools.stemmer.PorterStemmer;
 import opennlp.tools.similarity.apps.utils.Pair;

 import org.apache.commons.lang.StringUtils;

 /**
  * Static text utilities: sentence splitting, tokenization, stemming, bigram
  * counting, fingerprinting, summary generation and light HTML clean-up.
  *
  * All methods are static and keep no state between calls.
  */
 public class TextProcessor {

   private static final Logger LOG = Logger
       .getLogger("opennlp.tools.textsimilarity.TextProcessor");

   /**
    * Lower-case suffixes after which a sentence-boundary match is NOT treated
    * as the end of a sentence (see {@link #splitToSentences(String)}).
    */
   static final String[] abbrevs = { "mr.", "mrs.", "sen.", "rep.", "gov.",
       "miss.", "dr.", "oct.", "nov.", "jan.", "feb.", "mar.", "apr.", "may",
       "jun.", "jul.", "aug.", "sept." };

   /** Placeholder token emitted by {@link #fastTokenize(String, boolean)}. */
   private static final String PUNC_TOKEN = "<punc>";

   /**
    * Sentence boundary: terminal punctuation, an optional closing double
    * quote, one or more whitespace characters, then the first character of the
    * next sentence (upper-case letter, digit, or a lower-case 'i').
    */
   private static final Pattern SENTENCE_BOUNDARY = Pattern
       .compile("[\\?!\\.]\"?\\s+[A-Z0-9i]");

   /** Matches strings made only of digits (including the empty string). */
   private static final Pattern ALL_DIGITS = Pattern.compile("^\\d*$");

   /** Matches one character that is neither digit, word character nor space. */
   private static final Pattern PUNCTUATION_CHAR = Pattern
       .compile("[^\\d\\w\\s]");

   /**
    * Separators scrubbed by {@link #trimSentence(String, String)}, in the order
    * they are applied: replacement character (mis-decoded dash), en dash, em
    * dash, ASCII hyphen. Each is exactly three characters long.
    */
   private static final String[] DASH_SEPARATORS = { " \uFFFD ", " \u2013 ",
       " \u2014 ", " - " };

   /**
    * Finds token runs shared by pairs of segments and prints them to standard
    * output together with the number of segments they occur in.
    *
    * Despite its name this method does not modify {@code segments}; it is a
    * diagnostic that only reports the common phrases (runs of at least four
    * tokens).
    *
    * @param segments
    *          texts to compare pairwise
    */
   public static void removeCommonPhrases(ArrayList<String> segments) {

     List<Pair<List<String>, Map<String, HashSet<Integer>>>> docs = new ArrayList<Pair<List<String>, Map<String, HashSet<Integer>>>>();
     // tokenize each segment once
     for (String segment : segments) {
       docs.add(buildTokenPositions(segment));
     }

     // phrase -> indexes of the segments that contain it
     Map<String, HashSet<Integer>> commonSegments = new HashMap<String, HashSet<Integer>>();
     for (int i = 0; i < docs.size(); i++) {
       for (int k = i + 1; k < docs.size(); k++) {
         for (String seg : extractCommonSegments(docs.get(i), docs.get(k), 4)) {
           HashSet<Integer> docIds = commonSegments.get(seg);
           if (docIds == null) {
             docIds = new HashSet<Integer>();
             commonSegments.put(seg, docIds);
           }
           docIds.add(i);
           docIds.add(k);
         }
       }
     }

     System.out.println(segments.size() + " docs");
     for (Map.Entry<String, HashSet<Integer>> entry : commonSegments.entrySet()) {
       System.out.println(entry.getKey() + ":" + entry.getValue().size());
     }
   }

   /**
    * Extracts the whitespace-token runs that occur in both strings.
    *
    * @param s1
    *          first text
    * @param s2
    *          second text
    * @param segSize
    *          minimum number of tokens a run must have to be reported
    * @return the common runs, tokens joined by single spaces
    */
   public static HashSet<String> extractCommonSegments(String s1, String s2,
       Integer segSize) {
     Pair<List<String>, Map<String, HashSet<Integer>>> o1 = buildTokenPositions(s1);
     Pair<List<String>, Map<String, HashSet<Integer>>> o2 = buildTokenPositions(s2);

     return extractCommonSegments(o1, o2, segSize);
   }

   /**
    * Walks the tokens of A and grows a run while the run also exists
    * contiguously in B.
    *
    * Only the B positions that really continue the current run are carried
    * forward, so a run is reported only if B contains it as a contiguous
    * sequence. When a token cannot extend the run but does occur in B, it
    * starts a new run instead of being dropped. A run is reported once it has
    * been extended to at least {@code segSize} tokens; when it grows further
    * the shorter version is replaced by the longer one.
    */
   private static HashSet<String> extractCommonSegments(
       Pair<List<String>, Map<String, HashSet<Integer>>> objA,
       Pair<List<String>, Map<String, HashSet<Integer>>> objB, Integer segSize) {

     HashSet<String> commonSegments = new HashSet<String>();
     List<String> tokensA = objA.getFirst();
     Map<String, HashSet<Integer>> tokenPosB = objB.getSecond();
     int minTokens = segSize.intValue();

     // positions in B at which the current run may end
     HashSet<Integer> lastPositions = null;
     StringBuilder segment = new StringBuilder();
     int runLength = 0;

     for (String token : tokensA) {
       HashSet<Integer> positions = tokenPosB.get(token);
       if (positions == null) {
         // token unknown to B: the run is broken
         lastPositions = null;
         segment.setLength(0);
         runLength = 0;
         continue;
       }

       HashSet<Integer> continued = (lastPositions == null) ? null
           : followingPositions(lastPositions, positions);

       if (continued != null && !continued.isEmpty()) {
         // the run continues; drop the shorter version if it was reported
         commonSegments.remove(segment.toString());
         segment.append(' ').append(token);
         runLength++;
         lastPositions = continued;
         if (runLength >= minTokens) {
           commonSegments.add(segment.toString());
         }
       } else {
         // start a fresh run at this token
         segment.setLength(0);
         segment.append(token);
         runLength = 1;
         lastPositions = positions;
       }
     }

     return commonSegments;
   }

   /**
    * Returns the members of {@code candidates} that directly follow (index + 1)
    * some member of {@code previous}.
    */
   private static HashSet<Integer> followingPositions(
       HashSet<Integer> previous, HashSet<Integer> candidates) {
     HashSet<Integer> result = new HashSet<Integer>();
     for (Integer pos : previous) {
       Integer next = Integer.valueOf(pos.intValue() + 1);
       if (candidates.contains(next)) {
         result.add(next);
       }
     }
     return result;
   }

   /**
    * Splits a string on whitespace and indexes the positions of every token.
    *
    * @param s
    *          text to tokenize; {@code null} is treated as empty
    * @return pair of (token list, map from token to the set of indexes at which
    *         it occurs in that list)
    */
   public static Pair<List<String>, Map<String, HashSet<Integer>>> buildTokenPositions(
       String s) {

     List<String> tokens = new ArrayList<String>();
     String[] toks = StringUtils.split(s);
     if (toks != null) {
       tokens.addAll(Arrays.asList(toks));
     }

     Map<String, HashSet<Integer>> positionsByToken = new HashMap<String, HashSet<Integer>>();
     for (int i = 0; i < tokens.size(); i++) {
       String token = tokens.get(i);
       HashSet<Integer> pos = positionsByToken.get(token);
       if (pos == null) {
         pos = new HashSet<Integer>();
         positionsByToken.put(token, pos);
       }
       pos.add(i);
     }

     return new Pair<List<String>, Map<String, HashSet<Integer>>>(tokens,
         positionsByToken);
   }

   /**
    * Tells whether a token contains no letter and no digit.
    *
    * @param token
    *          token to inspect (must not be {@code null})
    * @return {@code true} if no character is a letter or digit; also
    *         {@code true} for the empty string
    */
   public static boolean isStringAllPunc(String token) {

     for (int i = 0; i < token.length(); i++) {
       if (Character.isLetterOrDigit(token.charAt(i))) {
         return false;
       }
     }
     return true;
   }

   /**
    * Splits input text into sentences.
    *
    * A boundary is terminal punctuation ({@code ? ! .}), an optional closing
    * double quote, whitespace, and then an upper-case letter, digit or 'i'.
    * Boundaries that follow one of the known {@link #abbrevs} are ignored and
    * the text is carried over into the next sentence.
    *
    * @param text
    *          Input text; {@code null} or blank yields an empty list
    * @return List of trimmed sentences
    */
   public static ArrayList<String> splitToSentences(String text) {

     ArrayList<String> sentences = new ArrayList<String>();
     if (text == null || text.trim().length() == 0) {
       return sentences;
     }

     Matcher m = SENTENCE_BOUNDARY.matcher(text);
     int idx = 0;
     String cand = "";

     while (m.find()) {
       // the last matched character already belongs to the next sentence
       int nextStart = m.end() - 1;
       cand += " " + text.substring(idx, nextStart).trim();
       if (!endsWithAbbreviation(cand)) {
         sentences.add(cand.trim());
         cand = "";
       }
       idx = nextStart;
     }

     // whatever is left, including text held back after an abbreviation
     String tail = (cand + " " + text.substring(idx)).trim();
     if (tail.length() > 0) {
       sentences.add(tail);
     }
     return sentences;
   }

   /** Checks whether the text ends (case-insensitively) with an abbreviation. */
   private static boolean endsWithAbbreviation(String text) {
     String lower = text.toLowerCase();
     for (int i = 0; i < abbrevs.length; i++) {
       if (lower.endsWith(abbrevs[i])) {
         return true;
       }
     }
     return false;
   }

   /**
    * Decides whether the punctuation character at {@code idx} may stay inside a
    * token (apostrophes and dashes between two letters or digits).
    *
    * The check is deliberately switched off: the method always returns
    * {@code false}, so {@link #fastTokenize(String, boolean)} splits on every
    * punctuation character. The detection logic is kept below for reference.
    */
   private static boolean isSafePunc(char[] chars, int idx) {

     if (true) {
       return false;
     }

     boolean retVal = false;
     int c = chars[idx];

     // apostrophe, hyphen, en/em dash and typographic single quotes
     if (c == 39 || c == 45 || c == 8211 || c == 8212 || c == 145 || c == 146
         || c == 8216 || c == 8217) {
       // a character at either end of the array cannot be "inside" a word
       if (idx == chars.length - 1 || idx == 0) {
         return false;
       }

       // both neighbours must be letters or digits
       if (Character.isLetterOrDigit(chars[idx + 1])
           && Character.isLetterOrDigit(chars[idx - 1])) {
         return true;
       }
     }

     return retVal;
   }

   /**
    * Replaces punctuation by spaces.
    *
    * @param sentence
    *          text to clean
    * @return the letter/digit tokens of the sentence separated by single
    *         spaces, with one leading and one trailing space (a single space
    *         when there are no tokens)
    */
   public static String removePunctuation(String sentence) {
     List<String> toks = fastTokenize(sentence, false);
     // List.toString() gives "[a, b]"; brackets and commas become spaces
     return toks.toString().replace('[', ' ').replace(']', ' ')
         .replace(',', ' ').replace("  ", " ");
   }

   /**
    * Splits text into runs of letters and digits.
    *
    * @param txt
    *          text to tokenize; {@code null} or empty yields an empty list
    * @param retainPunc
    *          if {@code true}, every non-whitespace, non-alphanumeric character
    *          is represented by a {@code <punc>} token
    * @return the tokens in order of appearance
    */
   public static ArrayList<String> fastTokenize(String txt, boolean retainPunc) {
     ArrayList<String> tokens = new ArrayList<String>();
     if (StringUtils.isEmpty(txt)) {
       return tokens;
     }

     StringBuilder tok = new StringBuilder();
     char[] chars = txt.toCharArray();

     for (int i = 0; i < chars.length; i++) {
       char c = chars[i];
       if (Character.isLetterOrDigit(c) || isSafePunc(chars, i)) {
         tok.append(c);
         continue;
       }
       // whitespace and punctuation both end the current token
       if (tok.length() > 0) {
         tokens.add(tok.toString());
         tok.setLength(0);
       }
       if (retainPunc && !Character.isWhitespace(c)) {
         tokens.add(PUNC_TOKEN);
       }
     }

     if (tok.length() > 0) {
       tokens.add(tok.toString());
     }
     return tokens;
   }

   /**
    * Joins tokens with single spaces.
    *
    * @param tokens
    *          tokens to join
    * @return the joined, trimmed string
    */
   public static String convertTokensToString(ArrayList<String> tokens) {
     StringBuilder b = new StringBuilder();
     for (String s : tokens) {
       b.append(s);
       b.append(" ");
     }

     return b.toString().trim();
   }

   /**
    * Array variant of {@link #getAllBigrams(ArrayList, boolean)}.
    *
    * @param tokens
    *          tokens in document order
    * @param retainPunc
    *          whether bigrams containing {@code <punc>} are kept
    * @return lower-cased bigram mapped to its frequency
    */
   public static Hashtable<String, Integer> getAllBigrams(String[] tokens,
       boolean retainPunc) {
     return getAllBigrams(new ArrayList<String>(Arrays.asList(tokens)),
         retainPunc);
   }

   /**
    * Counts all adjacent token pairs.
    *
    * @param tokens
    *          tokens in document order
    * @param retainPunc
    *          whether bigrams containing {@code <punc>} are kept
    * @return lower-cased bigram ("first second") mapped to its frequency
    */
   public static Hashtable<String, Integer> getAllBigrams(
       ArrayList<String> tokens, boolean retainPunc) {
     Hashtable<String, Integer> bGramCandidates = new Hashtable<String, Integer>();
     for (int i = 0; i < tokens.size() - 1; i++) {
       String bigram = (tokens.get(i) + " " + tokens.get(i + 1)).toLowerCase();
       // don't add punc tokens
       if (!retainPunc && bigram.indexOf(PUNC_TOKEN) != -1) {
         continue;
       }

       Integer seen = bGramCandidates.get(bigram);
       int freq = (seen == null) ? 1 : seen.intValue() + 1;
       bGramCandidates.put(bigram, Integer.valueOf(freq));
     }
     return bGramCandidates;
   }

   /**
    * Counts all adjacent token pairs with float frequencies.
    *
    * Stop-word filtering is currently not applied, so apart from the value
    * type this behaves like {@link #getAllBigrams(ArrayList, boolean)}. This
    * is a best-effort method: a runtime failure (for example a {@code null}
    * token) is logged and the bigrams collected so far are returned.
    *
    * @param tokens
    *          tokens in document order
    * @param retainPunc
    *          whether bigrams containing {@code <punc>} are kept
    * @return lower-cased bigram mapped to its frequency
    */
   public static Hashtable<String, Float> getAllBigramsStopWord(
       ArrayList<String> tokens, boolean retainPunc) {

     Hashtable<String, Float> bGramCandidates = new Hashtable<String, Float>();
     try {
       for (int i = 0; i < tokens.size() - 1; i++) {
         String first = tokens.get(i).toLowerCase();
         String second = tokens.get(i + 1).toLowerCase();
         String bigram = first + " " + second;
         // don't add punc tokens
         if (!retainPunc && bigram.indexOf(PUNC_TOKEN) != -1) {
           continue;
         }

         Float seen = bGramCandidates.get(bigram);
         float freq = (seen == null) ? 1f : seen.floatValue() + 1f;
         bGramCandidates.put(bigram, Float.valueOf(freq));
       }
     } catch (RuntimeException e) {
       LOG.severe("Problem building bigrams: " + e);
     }

     return bGramCandidates;
   }

   /**
    * Tokenizes the text (keeping {@code <punc>} markers) and stems every word
    * token.
    *
    * @param txt
    *          text to process
    * @return stemmed tokens interleaved with {@code <punc>} markers
    */
   public static ArrayList<String> tokenizeAndStemWithPunctuation(String txt) {
     ArrayList<String> tokens = fastTokenize(txt, true);
     for (int i = 0; i < tokens.size(); i++) {
       String token = tokens.get(i);
       if (!token.equals(PUNC_TOKEN)) {
         tokens.set(i, TextProcessor.stemTerm(token));
       }
     }

     return tokens;
   }

   /**
    * Removes leading characters that are neither letters nor digits.
    *
    * @param text
    *          text to trim; {@code null} yields the empty string
    * @return the text from its first letter or digit on, or "" if there is none
    */
   public static String trimPunctuationFromStart(String text) {
     if (text == null) {
       return "";
     }
     int start = 0;
     while (start < text.length() && isPunctuation(text.charAt(start))) {
       start++;
     }
     return text.substring(start);
   }

   /**
    * Removes leading and trailing characters that are neither letters nor
    * digits.
    *
    * @param text
    *          text to trim; {@code null} yields the empty string
    * @return the text between its first and last letter or digit (inclusive),
    *         or "" if there is none
    */
   public static String trimPunctuation(String text) {
     if (text == null) {
       return "";
     }
     int start = 0;
     while (start < text.length() && isPunctuation(text.charAt(start))) {
       start++;
     }
     if (start == text.length()) {
       return "";
     }
     // a letter or digit exists at 'start', so this loop stops at or after it
     int end = text.length();
     while (isPunctuation(text.charAt(end - 1))) {
       end--;
     }
     return text.substring(start, end);
   }

   /**
    * Tells whether a character is anything other than a letter or digit. Note
    * that whitespace therefore counts as punctuation here.
    *
    * @param c
    *          character to test
    * @return {@code true} if {@code c} is not a letter and not a digit
    */
   public static boolean isPunctuation(char c) {
     return !Character.isLetterOrDigit(c);
   }

   /**
    * Lower-cases a token, strips at most one punctuation character from each
    * end (see {@link #cleanToken(String)}) and stems the result.
    *
    * @param token
    *          token to normalize (must not be {@code null})
    * @return the stem; the empty string or a lone punctuation character is
    *         returned as is
    */
   public static String stemAndClean(String token) {
     String cleaned = cleanToken(token.toLowerCase());
     if (cleaned.length() == 0
         || (cleaned.length() == 1 && isPunctuation(cleaned))) {
       return cleaned;
     }
     return new PorterStemmer().stem(cleaned).toString();
   }

   /**
    * Trims whitespace and strips at most one punctuation character from the
    * end and then from the start of a token.
    *
    * @param token
    *          token to clean (must not be {@code null})
    * @return the cleaned token; a token that is a single punctuation character
    *         is returned unchanged
    */
   public static String cleanToken(String token) {
     token = token.trim();
     if (token.length() == 0) {
       return "";
     }
     // trailing punctuation
     if (isPunctuation(token.substring(token.length() - 1))) {
       if (token.length() == 1) {
         return token;
       }
       token = token.substring(0, token.length() - 1);
     }
     // leading punctuation
     if (isPunctuation(token)) {
       if (token.length() == 1) {
         return token;
       }
       token = token.substring(1);
     }

     return token;
   }

   /**
    * Tells whether a string consists only of digits.
    *
    * @param str
    *          string to test (must not be {@code null})
    * @return {@code true} if every character is a digit; the empty string also
    *         yields {@code true}
    */
   public static boolean isAllNumbers(String str) {
     return ALL_DIGITS.matcher(str).matches();
   }

   /**
    * Tells whether the FIRST character of the string is punctuation, i.e. not
    * a digit, word character or whitespace. Empty strings yield {@code false}.
    */
   private static boolean isPunctuation(String str) {
     if (str.length() < 1) {
       return false;
     }
     return PUNCTUATION_CHAR.matcher(str.substring(0, 1)).matches();
   }

   /**
    * Removes a possessive suffix (see {@link #stripToken(String)}) and stems
    * the term with the Porter stemmer.
    *
    * @param term
    *          term to stem
    * @return the stemmed term
    */
   public static String stemTerm(String term) {
     term = stripToken(term);
     PorterStemmer st = new PorterStemmer();

     return st.stem(term).toString();
   }

   /**
    * Computes a fingerprint of the text: the Base64 encoding of the SHA digest
    * of its UTF-8 bytes.
    *
    * @param s
    *          text to fingerprint
    * @return the fingerprint, or the empty string for {@code null} or empty
    *         input
    * @throws IllegalStateException
    *           if the SHA algorithm or UTF-8 is unavailable in this JVM
    */
   public static String generateFingerPrint(String s) {
     if (s == null || s.length() == 0) {
       return "";
     }
     return sha1Base64(s);
   }

   /** Base64-encoded SHA digest of the UTF-8 bytes of a non-null string. */
   private static String sha1Base64(String s) {
     try {
       MessageDigest md = MessageDigest.getInstance("SHA");
       md.update(s.getBytes("UTF-8"));
       return Base64.getEncoder().encodeToString(md.digest());
     } catch (NoSuchAlgorithmException e) {
       LOG.severe("NoSuchAlgorithmException " + e);
       throw new IllegalStateException("SHA digest is not available", e);
     } catch (UnsupportedEncodingException e) {
       LOG.severe("UnsupportedEncodingException " + e);
       throw new IllegalStateException("UTF-8 encoding is not available", e);
     }
   }

   /**
    * Same as {@link #generateFingerPrint(String)} but with the characters '?'
    * and '/' replaced by '+', so the value can be embedded in a URL path.
    *
    * @param s
    *          text to fingerprint
    * @return the URL-safe fingerprint, or the empty string for empty input
    */
   public static String generateUrlSafeFingerPrint(String s) {
     String signature = TextProcessor.generateFingerPrint(s);
     return signature.replaceAll("[?/]", "+");
   }

   /**
    * Computes a fingerprint of the stemmed-token histogram of the text, so
    * that texts with the same words in a different order get the same value.
    *
    * The histogram is serialized as {@code stem + count} in sorted stem order
    * and hashed like {@link #generateFingerPrint(String)}.
    *
    * @param s
    *          text to fingerprint
    * @return the fingerprint, or the empty string if the text has no word
    *         tokens
    * @throws Exception
    *           declared for backward compatibility; digest failures surface as
    *           {@link IllegalStateException}
    */
   public static String generateFingerPrintForHistogram(String s)
       throws Exception {

     // sorted map gives a canonical serialization order
     Map<String, Integer> histogram = new TreeMap<String, Integer>();

     for (String token : TextProcessor.fastTokenize(s, true)) {
       String tokenLower = token.toLowerCase();
       if (PUNC_TOKEN.equals(tokenLower) || "close_a".equals(tokenLower)
           || "open_a".equals(tokenLower)) {
         continue;
       }
       String stemmedToken = TextProcessor.stemTerm(tokenLower);
       Integer seen = histogram.get(stemmedToken);
       int freq = (seen == null) ? 1 : seen.intValue() + 1;
       histogram.put(stemmedToken, Integer.valueOf(freq));
     }

     StringBuilder hashString = new StringBuilder();
     for (Map.Entry<String, Integer> entry : histogram.entrySet()) {
       hashString.append(entry.getKey()).append(entry.getValue().intValue());
     }

     if (hashString.length() == 0) {
       return "";
     }
     return sha1Base64(hashString.toString());
   }

   /**
    * Removes a trailing possessive "'s". Besides the ASCII apostrophe, the
    * typographic apostrophe (U+2019), its Windows-1252 control-code form
    * (U+0092) and the replacement character (U+FFFD) are recognized.
    *
    * @param token
    *          token to strip (must not be {@code null})
    * @return the token without the possessive suffix
    */
   public static String stripToken(String token) {
     if (token.endsWith("\'s") || token.endsWith("\u2019s")
         || token.endsWith("\u0092s") || token.endsWith("\uFFFDs")) {
       token = token.substring(0, token.length() - 2);
     }
     return token;
   }

   /**
    * Counts the occurrences of each lower-cased token.
    *
    * @param tokens
    *          tokens to count
    * @return lower-cased token mapped to its number of occurrences
    */
   public static HashMap<String, Integer> getUniqueTokenIndex(List<String> tokens) {
     HashMap<String, Integer> counts = new HashMap<String, Integer>();

     for (String token : tokens) {
       String key = token.toLowerCase();
       Integer seen = counts.get(key);
       counts.put(key, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
     }

     return counts;
   }

   /**
    * Builds a short summary from the sentences of a text.
    *
    * Sentences are cleaned with {@link #trimSentence(String, String)}; only
    * those longer than 60 characters that start with an upper-case letter and
    * do not contain "By", "Page" or "&gt;&gt;" are kept, until
    * {@code numChars} is exceeded. Gaps between non-adjacent sentences are
    * marked with an ellipsis. Leading boilerplate (time stamps, a dangling
    * closing parenthesis, punctuation) is then stripped. If nothing usable
    * remains, or any step fails, the (pipe-cleaned) full text is returned.
    *
    * @param txt
    *          full text; {@code null} yields the empty string
    * @param title
    *          title to scrub from the sentences; may be {@code null}
    * @param numChars
    *          target length of the summary in characters
    * @param truncateInSentence
    *          if {@code true} the summary is cut at the last space before
    *          {@code numChars} and terminated with "..."
    * @return the trimmed summary
    */
   public static String generateSummary(String txt, String title, int numChars,
       boolean truncateInSentence) {
     if (txt == null) {
       return "";
     }
     String finalSummary = "";

     try {

       String[] puncChars = { ":", "--", "PM", "MST", "EST", "CST", "PST",
           "GMT", "AM", "  " };

       txt = txt.replace(" | ", " ");
       txt = txt.replace(" |", " ");
       ArrayList<String> sentences = TextProcessor.splitToSentences(txt);
       StringBuilder sum = new StringBuilder();
       int cnt = 0; // 1-based index of the current sentence
       int lCnt = 0; // index of the last sentence that was kept
       for (String s : sentences) {
         cnt++;
         s = trimSentence(s, title);
         if (s.length() > 60 && !s.contains("By") && !s.contains("Page")
             && !s.contains(">>") && Character.isUpperCase(s.charAt(0))) {
           if (Math.abs(cnt - lCnt) != 1 && lCnt != 0) {
             // sentences were skipped: complete an ellipsis
             sum.append(sum.toString().endsWith(".") ? ".." : "...");
           } else {
             sum.append(" ");
           }
           sum.append(s.trim());
           lCnt = cnt;
         }
         if (sum.length() > numChars) {
           break;
         }
       }

       finalSummary = sum.toString().trim();

       if (truncateInSentence) {
         finalSummary = truncateTextOnSpace(finalSummary, numChars);
         int numPeriods = countTrailingPeriods(finalSummary);

         if (numPeriods < 3 && finalSummary.length() > 0) {
           for (int i = 0; i < 3 - numPeriods; i++) {
             finalSummary += ".";
           }
         }
       } else {
         // trim final period
         if (finalSummary.endsWith("..")) {
           finalSummary = finalSummary.substring(0, finalSummary.length() - 2);
         }
       }
       // check to see if we have anything, if not, return the fullcontent
       if (finalSummary.trim().length() < 5) {
         finalSummary = txt;
       }
       // drop a boilerplate prefix that ends within the first 45 characters
       int highestIdx = -1;
       int sIdx = Math.max(0, Math.min(finalSummary.length() - 1, 45));
       String head = finalSummary.trim().substring(0, sIdx);
       for (String p : puncChars) {
         int idx = head.lastIndexOf(p);
         if (idx > highestIdx && idx < 45) {
           highestIdx = idx + p.length();
         }
       }

       if (highestIdx > -1) {
         finalSummary = finalSummary.substring(highestIdx);
       }

       // drop a fragment that ends with an unmatched ")" near the start
       int closeParenIdx = finalSummary.indexOf(")");
       int openParenIdx = finalSummary.indexOf("(");
       if (closeParenIdx != -1 && closeParenIdx < 15
           && (openParenIdx == -1 || openParenIdx > closeParenIdx)) {
         finalSummary = finalSummary.substring(closeParenIdx + 1).trim();
       }

       finalSummary = trimPunctuationFromStart(finalSummary);

       // check to see if we have anything, if not, return the fullcontent
       if (finalSummary.trim().length() < 5) {
         finalSummary = txt;
       }

     } catch (Exception e) {
       LOG.severe("Problem forming summary for: " + txt);
       LOG.severe("Using full text for the summary" + e);
       finalSummary = txt;
     }

     return finalSummary.trim();
   }

   /**
    * Cuts a text to at most {@code numChars} characters, backing up to the
    * last whitespace so that no word is split.
    *
    * @param txt
    *          text to truncate
    * @param numChars
    *          maximum length
    * @return the text up to (excluding) the last whitespace within the first
    *         {@code numChars} characters; the unchanged text if it is short
    *         enough or if that prefix contains no whitespace
    */
   public static String truncateTextOnSpace(String txt, int numChars) {
     if (txt.length() <= numChars) {
       return txt;
     }
     String head = txt.substring(0, numChars);
     for (int i = head.length() - 1; i >= 0; i--) {
       if (Character.isWhitespace(head.charAt(i))) {
         return head.substring(0, i);
       }
     }
     return txt;
   }

   /**
    * Counts the '.' characters at the end of a text.
    *
    * @param txt
    *          text to inspect
    * @return number of consecutive periods at the end
    */
   public static int countTrailingPeriods(String txt) {
     int count = 0;
     for (int i = txt.length() - 1; i >= 0 && txt.charAt(i) == '.'; i--) {
       count++;
     }
     return count;
   }

   /**
    * Strips boilerplate from the start of a sentence.
    *
    * Steps, in order: cut everything up to the last run of four or more
    * consecutive upper-case letters; skip leading characters up to the first
    * upper-case letter; cut everything up to and including the title; cut
    * everything up to a spaced dash; cut everything up to "del.icio.us".
    *
    * @param txt
    *          sentence to clean (must not be {@code null})
    * @param title
    *          title to scrub; {@code null} or blank disables that step
    * @return the cleaned sentence
    */
   public static String trimSentence(String txt, String title) {

     // scan backwards for a run of more than three upper-case letters
     int upperRun = 0;
     int runEndIdx = -1;
     String cleaned = txt;
     for (int i = txt.length() - 1; i >= 0; i--) {
       if (Character.isUpperCase(txt.charAt(i))) {
         if (upperRun == 0) {
           runEndIdx = i;
         }
         upperRun++;
       } else {
         upperRun = 0;
         runEndIdx = -1;
       }
       if (upperRun > 3) {
         cleaned = txt.substring(runEndIdx + 1);
         break;
       }
     }

     txt = cleaned;

     // now scrub the start of the string (the last character is always kept)
     int idx = 0;
     while (idx < txt.length() - 1 && !Character.isUpperCase(txt.charAt(idx))) {
       idx++;
     }
     txt = txt.substring(idx);

     // scrub the title, including its last character
     if (title != null) {
       String trimmedTitle = title.trim();
       int titleIdx = trimmedTitle.length() > 0 ? txt.indexOf(trimmedTitle) : -1;
       if (titleIdx != -1) {
         txt = txt.substring(titleIdx + trimmedTitle.length());
       }
     }

     // scrub before the first spaced dash of each kind
     for (String separator : DASH_SEPARATORS) {
       int at = txt.indexOf(separator);
       if (at != -1) {
         txt = txt.substring(at + separator.length());
       }
     }
     int deliciousIdx = txt.indexOf("del.icio.us");
     if (deliciousIdx != -1) {
       txt = txt.substring(deliciousIdx + "del.icio.us".length());
     }

     return txt;
   }

   /**
    * Removes stop phrases from the first and last quarter of a text.
    *
    * No stop list is wired in at the moment, so the set of stop phrases is
    * empty and the method returns the trimmed input.
    *
    * @param txt
    *          text to clean (must not be {@code null})
    * @return the text without leading/trailing stop-phrase sections, trimmed
    */
   public static String removeStopListedTermsAndPhrases(String txt) {
     // TODO: load the "EXTRACTOR" stop list once a StopList is available
     HashSet<String> stopPhrases = new HashSet<String>();

     // segment into the first quarter, the middle and the last quarter
     int startIdx = txt.length() / 4;
     int endIdx = txt.length() - (txt.length() / 4);
     String startPart = txt.substring(0, startIdx);
     String middlePart = txt.substring(startIdx, endIdx);
     String endPart = txt.substring(endIdx, txt.length());

     for (String phrase : stopPhrases) {
       // cut the head up to and including the phrase
       int idx = startPart.indexOf(phrase);
       if (idx != -1) {
         startPart = startPart.substring(idx + phrase.length());
       }
       // cut the tail from the phrase on
       idx = endPart.indexOf(phrase);
       if (idx != -1) {
         endPart = endPart.substring(0, idx);
       }
     }

     return (startPart + middlePart + endPart).trim();
   }

   /**
    * Collects the distinct space-separated tokens that start with
    * {@code http://} or {@code https://}.
    *
    * @param txt
    *          text to scan
    * @return the URLs in order of first appearance
    */
   public static List<String> extractUrlsFromText(String txt) {
     List<String> urls = new ArrayList<String>();
     for (String token : txt.split(" ")) {
       boolean isUrl = token.startsWith("http://")
           || token.startsWith("https://");
       if (isUrl && !urls.contains(token)) {
         urls.add(token);
       }
     }

     return urls;
   }

   /**
    * Finds the lower-cased tokens that occur in every segment.
    *
    * @param segments
    *          space-separated texts
    * @return tokens present in all segments; empty if fewer than two segments
    *         are given
    */
   public static List<String> findCommonTokens(List<String> segments) {
     List<String> commonTokens = new ArrayList<String>();
     if (segments.size() <= 1) {
       return commonTokens;
     }

     // one entry per (segment, distinct token)
     List<String> allTokens = new ArrayList<String>();
     for (String segment : segments) {
       List<String> tokens = Arrays.asList(segment.split(" "));
       allTokens.addAll(TextProcessor.getUniqueTokenIndex(tokens).keySet());
     }

     // a token counted once per segment is common to all of them
     HashMap<String, Integer> segmentCounts = TextProcessor
         .getUniqueTokenIndex(allTokens);
     for (Map.Entry<String, Integer> entry : segmentCounts.entrySet()) {
       if (entry.getValue().intValue() == segments.size()) {
         commonTokens.add(entry.getKey());
       }
     }

     return commonTokens;
   }

   /**
    * Counts the whitespace-separated tokens of a text.
    *
    * @param txt
    *          text to count; may be {@code null}
    * @return number of tokens, 0 for {@code null} or blank text
    */
   public static int numTokensInString(String txt) {
     if (txt == null) {
       return 0;
     }
     String trimmed = txt.trim();
     return trimmed.length() == 0 ? 0 : trimmed.split("\\s+").length;
   }

   /**
    * Normalizes HTML fragments: non-breaking spaces, whitespace runs, empty
    * paragraphs and repeated break tags.
    *
    * @param str
    *          HTML text; {@code null} or empty is returned unchanged
    * @return the normalized text
    */
   public static String defragmentText(String str) {

     if (StringUtils.isNotEmpty(str)) {
       str = str.replaceAll("&nbsp;", " "); // replace &nbsp; with spaces
       str = str.replaceAll("<br />", "<br/>"); // normalize break tag
       // replace multiple white spaces with single space
       str = str.replaceAll("\\s+", " ");

       // remove empty paragraphs - would be nice to have single regex for this
       str = str.replaceAll("<p> </p>", "");
       str = str.replaceAll("<p></p>", "");
       str = str.replaceAll("<p/>", "");

       // drop a strong tag that only wraps a break tag
       str = str.replaceAll("<strong><br/></strong>", "<br/>");
       // replace any run of break tags with exactly two break tags
       str = str.replaceAll("(<br/>)+", "<br/><br/>");
       // replace paragraph followed by break with just a paragraph element
       str = str.replaceAll("<p><br/>", "<p>");
     }

     return str;
   }
 }