// blob: ad0f7911af9f9c0ccfc12390ffe1134995cb1500 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parse_thicket.opinion_processor;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import opennlp.tools.stemmer.PStemmer;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Registry of named word lists ("stop lists") read from <code>*.vcb</code>
 * files located in <code>&lt;resourceDir&gt;/maps</code>.
 *
 * <p>
 * File format: the first non-empty line of a file is the name of the list,
 * every other line is one entry of that list. Lists are kept in a static
 * table shared by the whole JVM; the singleton returned by
 * {@link #getInstance()} is only a handle to that shared state.
 *
 * <p>
 * Nothing is loaded when the class is initialised, because no resource
 * directory can be known at that point. Lists are loaded by
 * {@link #getInstance(String)}, or by {@link #getInstance()} once
 * {@link #resourceDir} has been set.
 */
public class StopList {
    private static StopList m_StopList = null;
    private static final Hashtable<String, HashSet<String>> m_stopHash = new Hashtable<String, HashSet<String>>();
    public static final Log logger = LogFactory.getLog(StopList.class);
    private static final String DEFAULT_STOPLIST = "STANDARD";
    private static final String FIRST_NAMES_LIST = "FIRST_NAMES";
    private static final String ENGLISH_DICTIONARY_LIST = "ENG_DICT";
    // The unusual capitalisation is the key the original code looks up; it
    // has to match the header line of the corresponding .vcb file.
    private static final String EVENT_WORDS_LIST = "fREQUENTEVENTNAMEWORDS";
    /** Directory that contains the <code>maps</code> sub-directory with the .vcb files. */
    public static String resourceDir = null;
    /** Value of {@link #resourceDir} at the last load attempt (successful or not). */
    private static String m_lastLoadDir = null;
    private static PStemmer stemmer = new PStemmer();

    /** Immutable summary of the words of one "like" expression. */
    private static final class LikeProfile {
        final boolean hasFirstName;
        final boolean allWordsCommonEnglish;
        final int wordCount;
        final String contentWords;

        LikeProfile(boolean hasFirstName, boolean allWordsCommonEnglish,
                int wordCount, String contentWords) {
            this.hasFirstName = hasFirstName;
            this.allWordsCommonEnglish = allWordsCommonEnglish;
            this.wordCount = wordCount;
            this.contentWords = contentWords;
        }
    }

    /**
     * Get the StopList singleton instance. The lists are loaded on first use
     * and again whenever {@link #resourceDir} differs from the directory used
     * for the previous load attempt.
     *
     * @return The StopList
     */
    static public synchronized StopList getInstance() {
        ensureLoaded();
        return m_StopList;
    }

    /**
     * Get the StopList singleton instance, reading the lists from the given
     * directory. Unlike a plain singleton accessor this also (re)loads the
     * lists when the singleton already exists but was created for a different
     * directory (for example by an earlier {@link #getInstance()} call).
     *
     * @param dir
     *            directory containing the <code>maps</code> sub-directory
     * @return The StopList
     */
    static public synchronized StopList getInstance(String dir) {
        resourceDir = dir;
        ensureLoaded();
        return m_StopList;
    }

    /**
     * Creates the singleton if needed and loads the lists on first use or
     * when the resource directory changed. Never throws: loading is
     * best-effort and failures are logged.
     */
    private static void ensureLoaded() {
        boolean firstUse = (m_StopList == null);
        if (firstUse) {
            m_StopList = new StopList();
        }
        boolean dirChanged = resourceDir != null
                && !resourceDir.equals(m_lastLoadDir);
        if (firstUse || dirChanged) {
            try {
                LoadStopList();
            } catch (RuntimeException e) {
                // e.g. SecurityException while listing the directory; callers
                // of getInstance() have never had to handle exceptions.
                logger.error("Problem reading Stop Lists from " + resourceDir, e);
            }
        }
    }

    /**
     * Loads every <code>*.vcb</code> file found in
     * <code>&lt;resourceDir&gt;/maps</code>. A file that cannot be opened is
     * logged and skipped so that the remaining files are still loaded.
     */
    private static void LoadStopList() {
        String baseDir = resourceDir;
        // Remember the attempt even if it fails, so that an invalid directory
        // is not re-scanned on every getInstance() call.
        m_lastLoadDir = baseDir;
        if (baseDir == null) {
            logger.warn("Problem reading Stop Lists! No resource directory has been set");
            return;
        }
        File mapsDir = new File(baseDir + "/maps");
        String[] children = mapsDir.list();
        if (children == null) {
            logger.error("Problem reading Stop Lists! Cannot list directory " + mapsDir);
            return;
        }
        for (String fileName : children) {
            if (!fileName.endsWith(".vcb")) {
                continue;
            }
            File listFile = new File(mapsDir, fileName);
            try {
                loadStopListFile(listFile);
            } catch (FileNotFoundException e) {
                logger.error("Cannot open stop list file " + listFile, e);
            }
        }
    }

    /**
     * Reads one list file and registers it under the name found on its first
     * non-empty line, replacing any list previously registered under that
     * name. A file without a name line is ignored. If reading fails midway,
     * the entries read so far are still registered.
     *
     * @param f
     *            the file to read
     * @throws FileNotFoundException
     *             if the file cannot be opened
     */
    private static void loadStopListFile(File f) throws FileNotFoundException {
        BufferedReader in = new BufferedReader(new FileReader(f));
        HashSet<String> entries = new HashSet<String>();
        String listName = "";
        boolean nameSeen = false;
        try {
            String line;
            while ((line = in.readLine()) != null) {
                if (!nameSeen && line.length() > 0) {
                    nameSeen = true;
                    listName = line;
                } else {
                    entries.add(line);
                }
            }
        } catch (IOException ioe) {
            logger.warn("Error while reading stop list file " + f
                    + "; keeping the entries read so far", ioe);
        } finally {
            try {
                // Closing the BufferedReader also closes the wrapped FileReader.
                in.close();
            } catch (IOException ioe) {
                logger.warn("Cannot close stop list file " + f, ioe);
            }
        }
        if (listName.length() > 0) {
            m_stopHash.put(listName, entries);
        }
    }

    /**
     * Single-lookup membership test shared by all public predicates.
     *
     * @return true only if a list with that name is loaded and contains the word
     */
    private static boolean listContains(String listName, String word) {
        if (listName == null) {
            return false;
        }
        HashSet<String> list = m_stopHash.get(listName);
        return list != null && list.contains(word);
    }

    /**
     * Is the given word in the stop words list? Uses the default "STANDARD"
     * stoplist.
     *
     * @param str
     *            The word to check
     * @return is a stop word
     */
    public static boolean isStopWord(String str) {
        return listContains(DEFAULT_STOPLIST, str);
    }

    /**
     * Is the given word an entry of the "FIRST_NAMES" list? The word is
     * upper-cased before the lookup.
     *
     * @param str
     *            The word to check; null is never a first name
     * @return is a first name
     */
    public static boolean isFirstName(String str) {
        if (str == null) {
            return false;
        }
        return listContains(FIRST_NAMES_LIST, str.toUpperCase(Locale.ENGLISH));
    }

    /**
     * Picks a pseudo-random entry of the "FIRST_NAMES" list.
     *
     * @return the chosen name, lower-cased
     * @throws IllegalStateException
     *             if the "FIRST_NAMES" list is not loaded or has no entries
     */
    public String getRandomFirstName() {
        HashSet<String> firstNames = m_stopHash.get(FIRST_NAMES_LIST);
        if (firstNames == null || firstNames.isEmpty()) {
            throw new IllegalStateException(
                    "Stop list FIRST_NAMES is not loaded or is empty");
        }
        // Math.random() < 1.0, hence 0 <= skip < size.
        int skip = (int) (Math.random() * firstNames.size());
        Iterator<String> iter = firstNames.iterator();
        for (int i = 0; i < skip; i++) {
            iter.next();
        }
        return iter.next().toLowerCase(Locale.ENGLISH);
    }

    /**
     * Is the stemmed, lower-cased form of the word an entry of the "ENG_DICT"
     * list? If stemming fails, the empty string is looked up instead.
     *
     * @param str
     *            The word to check
     * @return true for null input, otherwise whether the stem is in the list
     */
    public static boolean isCommonWord(String str) {
        if (str == null) {
            return true;
        }
        String stemmed = "";
        try {
            stemmed = stemmer.stem(str).toLowerCase(Locale.ENGLISH);
        } catch (Exception ignored) {
            // Stemming failures are not informative; the word is then looked
            // up as the empty string.
        }
        return listContains(ENGLISH_DICTIONARY_LIST, stemmed);
    }

    /**
     * Is the lower-cased word an entry of the frequent-event-name-words list?
     *
     * @param str
     *            The word to check
     * @return true for null input, otherwise whether the word is in the list
     */
    public boolean isCommonEventWord(String str) {
        if (str == null) {
            return true;
        }
        return listContains(EVENT_WORDS_LIST, str.toLowerCase(Locale.ENGLISH));
    }

    /**
     * Is the given word in the stop words list provided?
     *
     * @param str
     *            The word to check
     * @param stop_list
     *            the name of the stoplist to check against
     * @return is a stop word; false if no list with that name is loaded
     */
    public static boolean isStopWord(String str, String stop_list) {
        return listContains(stop_list, str);
    }

    /** Same as {@link #isStopWord(String)}. */
    public boolean isStopWordAll(String str) {
        return isStopWord(str);
    }

    /**
     * Returns the live (not copied) set registered under the given name.
     *
     * @param name
     *            list name
     * @return the set, or null if there is no such list
     */
    public HashSet<String> getStopListMap(String name) {
        if (name == null) {
            return null;
        }
        return m_stopHash.get(name);
    }

    /**
     * Character-level checks shared by the "like" filters: the expression
     * (spaces ignored) must be alphanumeric, must not be purely numeric and
     * must have at least 4 characters. Each rejection is logged.
     *
     * @param like
     *            lower-cased expression
     * @return true if the expression passes all checks
     */
    private static boolean passesBasicLikeChecks(String like) {
        if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
            logger.info("removed isAlphanumeric " + like);
            return false;
        }
        if (StringUtils.isNumeric(like)) {
            logger.info("removed isNumericSpace " + like);
            return false;
        }
        if (like.length() < 4) {
            logger.info("removed too short likes " + like);
            return false;
        }
        return true;
    }

    /**
     * Splits the expression on single spaces and records whether any word is
     * a first name, whether all words are common English words, how many
     * words there are, and the expression with stop words and words shorter
     * than 3 characters removed.
     */
    private static LikeProfile profileLike(String like) {
        String[] words = like.split(" ");
        boolean hasFirstName = false;
        boolean allWordsCommonEnglish = true;
        StringBuilder kept = new StringBuilder();
        for (String word : words) {
            if (!isCommonWord(word)) {
                allWordsCommonEnglish = false;
            }
            if (isFirstName(word)) {
                hasFirstName = true;
            }
            if (!isStopWord(word) && word.length() >= 3) {
                kept.append(word).append(' ');
            }
        }
        return new LikeProfile(hasFirstName, allWordsCommonEnglish,
                words.length, kept.toString().trim());
    }

    /**
     * Filters a list of user "likes".
     *
     * <p>
     * Lists with fewer than 6 entries are returned unfiltered. Otherwise each
     * entry is lower-cased and dropped if it fails the character-level checks,
     * consists of one or two common English words without a first name, or is
     * a lone first name. A single common English word is recorded as a
     * potential category. Surviving entries have stop words and words shorter
     * than 3 characters removed and are de-duplicated in input order. If
     * fewer than two entries survive, the original list is returned instead.
     *
     * @param userLikes
     *            the expressions to filter; null is treated as empty
     * @return a two-element list: [0] the filtered (or original) likes, [1]
     *         the potential categories
     */
    public static List<List<String>> preFilterCommonEnglishExpressions(
            List<String> userLikes) {
        List<List<String>> results = new ArrayList<List<String>>();
        List<String> potentialCategs = new ArrayList<String>();
        if (userLikes == null) {
            results.add(new ArrayList<String>());
            results.add(potentialCategs);
            return results;
        }
        if (userLikes.size() < 6) {// too short, do not filter
            results.add(userLikes);
            results.add(potentialCategs);
            return results;
        }
        // LinkedHashSet: de-duplicates while keeping the input order.
        LinkedHashSet<String> filteredLikes = new LinkedHashSet<String>();
        for (String rawLike : userLikes) {
            if (rawLike == null) {
                continue;
            }
            String like = rawLike.toLowerCase(Locale.ENGLISH);
            if (!passesBasicLikeChecks(like)) {
                continue;
            }
            LikeProfile profile = profileLike(like);
            boolean plainEnglish = !profile.hasFirstName
                    && profile.allWordsCommonEnglish;
            // The one-word case must be tested before the "< 3 words" case,
            // otherwise it can never be reached and no category is ever
            // recorded.
            if (plainEnglish && profile.wordCount == 1) {
                logger.info("moved to category: NoFirstName+AllCommonEng+Short1word "
                        + like);
                potentialCategs.add(like);
                continue;
            }
            if (plainEnglish && profile.wordCount < 3) {
                // Message text kept as it always was; the entry is dropped here.
                logger.info("moved to category: NoFirstName+AllCommonEng+ShorterThan3 "
                        + like);
                continue;
            }
            if (profile.hasFirstName && profile.wordCount == 1) {
                logger.info("removed : only first name, no last name " + like);
                continue;
            }
            if (profile.contentWords.length() == 0) {
                // Nothing but stop words / very short words: an empty string
                // is not a meaningful like.
                logger.info("removed : only stop words or too short words " + like);
                continue;
            }
            filteredLikes.add(profile.contentWords);
        }
        if (filteredLikes.size() > 1) {
            results.add(new ArrayList<String>(filteredLikes));
        } else {// do not do reduction
            results.add(userLikes);
        }
        results.add(potentialCategs);
        return results;
    }

    /**
     * Applies the per-entry rules of
     * {@link #preFilterCommonEnglishExpressions(List)} to a single expression.
     *
     * @param like
     *            the expression to check
     * @return false for null, for expressions failing the character-level
     *         checks, for one or two common English words without a first
     *         name, and for a lone first name; true otherwise
     */
    public static boolean isAcceptableIndividualLikes(String like) {
        if (like == null) {
            return false;
        }
        // Make sure the lists have been loaded (if a directory is configured).
        StopList.getInstance();
        String lowered = like.toLowerCase(Locale.ENGLISH);
        if (!passesBasicLikeChecks(lowered)) {
            return false;
        }
        LikeProfile profile = profileLike(lowered);
        if (!profile.hasFirstName && profile.allWordsCommonEnglish
                && profile.wordCount < 3) {
            logger.info(" NoFirstName+AllCommonEng+ShorterThan3 " + lowered);
            return false;
        }
        if (profile.hasFirstName && profile.wordCount == 1) {
            logger.info("removed : only first name, no last name " + lowered);
            return false;
        }
        return true;
    }

    /**
     * Manual smoke test. The resource directory may be passed as the first
     * argument; without it the original developer's path is used.
     */
    public static void main(String[] args) {
        String dir = (args != null && args.length > 0) ? args[0]
                : "/Users/borisgalitsky/Documents/workspace/opennlp-similarity/src/test/resources/";
        StopList list = StopList.getInstance(dir);
        System.out.println("isCommonWord(demonstration) = "
                + isCommonWord("demonstration"));
        if (list.getStopListMap(FIRST_NAMES_LIST) != null
                && !list.getStopListMap(FIRST_NAMES_LIST).isEmpty()) {
            System.out.println("random first name = " + list.getRandomFirstName());
        }
        String[] eventWords = { "tour", "dance", "salsa", "center", "family" };
        for (String word : eventWords) {
            System.out.println("isCommonEventWord(" + word + ") = "
                    + list.isCommonEventWord(word));
        }
        String[] likes = { "forest glen", "drive", "house", "Timothy Kloug",
                "Mamma Mia" };
        for (String like : likes) {
            System.out.println("isAcceptableIndividualLikes(" + like + ") = "
                    + isAcceptableIndividualLikes(like));
        }
    }
}