blob: 232c0529c04b7edccbba371cba028dcb804e0edf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.conceptMapper.support.tokens;
import java.util.HashSet;
import java.util.Set;
import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.conceptMapper.Logger;
import org.apache.uima.conceptMapper.support.dictionaryResource.DictionaryToken;
/**
 * Decides whether a token should take part in dictionary lookups.
 *
 * <p>A token is accepted when it passes three independent checks: its token class (a string
 * feature), its token type (an integer feature) and a stop word list. The class and type checks
 * are driven by optional inclusion/exclusion sets read from the annotator configuration in
 * {@link #initConfig(UimaContext)}; the features themselves are resolved against a type system in
 * {@link #initTypes(TypeSystem, boolean)}.
 */
public class TokenFilter {
  private String tokenClassFeatureName;

  private Feature tokenClassFeature = null;

  private String tokenTypeFeatureName;

  private Feature tokenTypeFeature = null;

  private Logger logger;

  /** Configuration parameter for list of token classes to include in lookups */
  public static final String PARAM_INCLUDEDTOKENCLASSES = "IncludedTokenClasses";

  /** Token classes accepted by the filter, or null when the parameter is unset or empty. */
  protected HashSet<String> includedTokenClasses;

  /** Configuration parameter for list of token classes to exclude from lookups */
  public static final String PARAM_EXCLUDEDTOKENCLASSES = "ExcludedTokenClasses";

  /** Token classes rejected by the filter, or null when the parameter is unset or empty. */
  protected HashSet<String> excludedTokenClasses;

  /** Configuration parameter for list of token types to include in lookups */
  public static final String PARAM_INCLUDEDTOKENTYPES = "IncludedTokenTypes";

  /** Token types accepted by the filter, or null when the parameter is unset or empty. */
  protected HashSet<Integer> includedTokenTypes;

  /** Configuration parameter for list of token types to exclude from lookups */
  public static final String PARAM_EXCLUDEDTOKENTYPES = "ExcludedTokenTypes";

  /** Token types rejected by the filter, or null when the parameter is unset or empty. */
  protected HashSet<Integer> excludedTokenTypes;

  /** Configuration parameter for list of stop words, which are never looked up */
  public static final String PARAM_STOPWORDS = "StopWords";

  /** Lower-cased stop words; null until {@link #initConfig(UimaContext)} has run. */
  private Set<String> stopWords = null;

  /** Configuration parameter giving type of tokens */
  public static final String PARAM_TOKENANNOTATION = "TokenAnnotation";

  String tokenAnnotationName = null;

  /**
   * @param tokenAnnotationName
   *          name of the token annotation type, resolved later by initTypes
   * @param tokenTypeFeatureName
   *          base name of the integer token type feature; null or empty disables type checks
   * @param tokenClassFeatureName
   *          base name of the string token class feature; null or empty disables class checks
   * @param logger
   *          destination for error messages
   */
  public TokenFilter(String tokenAnnotationName, String tokenTypeFeatureName,
          String tokenClassFeatureName, Logger logger) {
    this.tokenAnnotationName = tokenAnnotationName;
    this.tokenTypeFeatureName = tokenTypeFeatureName;
    this.tokenClassFeatureName = tokenClassFeatureName;
    this.logger = logger;
  }

  public String getTokenClassFeatureName() {
    return tokenClassFeatureName;
  }

  public void setTokenClassFeatureName(String tokenClassFeatureName) {
    this.tokenClassFeatureName = tokenClassFeatureName;
  }

  public Feature getTokenClassFeature() {
    return tokenClassFeature;
  }

  public void setTokenClassFeature(Feature tokenClassFeature) {
    this.tokenClassFeature = tokenClassFeature;
  }

  public String getTokenTypeFeatureName() {
    return tokenTypeFeatureName;
  }

  public void setTokenTypeFeatureName(String tokenTypeFeatureName) {
    this.tokenTypeFeatureName = tokenTypeFeatureName;
  }

  public Feature getTokenTypeFeature() {
    return tokenTypeFeature;
  }

  public void setTokenTypeFeature(Feature tokenTypeFeature) {
    this.tokenTypeFeature = tokenTypeFeature;
  }

  public String getTokenAnnotationName() {
    return tokenAnnotationName;
  }

  public void setTokenAnnotationName(String tokenAnnotationName) {
    this.tokenAnnotationName = tokenAnnotationName;
  }

  /**
   * Reads the inclusion/exclusion lists and the stop word list from the configuration.
   *
   * <p>A list parameter that is missing or empty leaves the corresponding set null, which the
   * checks treat as "no restriction".
   *
   * @param uimaContext
   *          context supplying the configuration parameter values
   * @throws AnnotatorConfigurationException
   *           if a token class (or token type) list is configured but no token class (or token
   *           type) feature name was supplied, or if building the stop word list fails
   */
  public void initConfig(UimaContext uimaContext) throws AnnotatorConfigurationException {
    try {
      String[] stopWordList = (String[]) uimaContext.getConfigParameterValue(PARAM_STOPWORDS);
      String[] includedTokenClassStrings = (String[]) uimaContext
              .getConfigParameterValue(PARAM_INCLUDEDTOKENCLASSES);
      String[] excludedTokenClassStrings = (String[]) uimaContext
              .getConfigParameterValue(PARAM_EXCLUDEDTOKENCLASSES);
      Integer[] includedTokenTypeInts = (Integer[]) uimaContext
              .getConfigParameterValue(PARAM_INCLUDEDTOKENTYPES);
      Integer[] excludedTokenTypeInts = (Integer[]) uimaContext
              .getConfigParameterValue(PARAM_EXCLUDEDTOKENTYPES);

      includedTokenClasses = toSetOrNull(includedTokenClassStrings);
      excludedTokenClasses = toSetOrNull(excludedTokenClassStrings);
      includedTokenTypes = toSetOrNull(includedTokenTypeInts);
      excludedTokenTypes = toSetOrNull(excludedTokenTypeInts);

      // A class/type list is meaningless without the feature that carries the value to test.
      boolean classListConfigured = (includedTokenClasses != null)
              || (excludedTokenClasses != null);
      if (classListConfigured && (tokenClassFeatureName == null)) {
        logError("TokenFilter.initConfig(), " + PARAM_INCLUDEDTOKENCLASSES + " or "
                + PARAM_EXCLUDEDTOKENCLASSES
                + " is configured, but no token class feature name was supplied");
        throw new AnnotatorConfigurationException();
      }
      boolean typeListConfigured = (includedTokenTypes != null) || (excludedTokenTypes != null);
      if (typeListConfigured && (tokenTypeFeatureName == null)) {
        logError("TokenFilter.initConfig(), " + PARAM_INCLUDEDTOKENTYPES + " or "
                + PARAM_EXCLUDEDTOKENTYPES
                + " is configured, but no token type feature name was supplied");
        throw new AnnotatorConfigurationException();
      }

      stopWords = initializeStopWordList(stopWordList);
    } catch (AnnotatorContextException ie) {
      throw new AnnotatorConfigurationException(ie);
    }
  }

  /**
   * Copies an array of configuration values into a set.
   *
   * @return a set holding every array element, or null if the array is null or empty
   */
  private static <T> HashSet<T> toSetOrNull(T[] values) {
    if ((values == null) || (values.length == 0)) {
      return null;
    }
    HashSet<T> result = new HashSet<T>();
    for (T value : values) {
      result.add(value);
    }
    return result;
  }

  /**
   * Reports an error through the configured logger, falling back to standard error when no logger
   * was supplied so that the message is never lost to a NullPointerException.
   */
  private void logError(String message) {
    if (logger != null) {
      logger.logError(message);
    } else {
      System.err.println(message);
    }
  }

  /**
   * Builds a stop word set from the configured strings.
   *
   * @param stopWordsStrings
   *          stop words as configured; may be null, and null entries are skipped
   * @return a set containing the lower-cased form of every non-null entry; never null
   * @throws AnnotatorContextException
   *           declared for callers; not thrown by this implementation
   */
  static public Set<String> initializeStopWordList(String[] stopWordsStrings)
          throws AnnotatorContextException {
    Set<String> retVal = new HashSet<String>();
    if (stopWordsStrings != null) {
      for (String stopWord : stopWordsStrings) {
        if (stopWord != null) {
          retVal.add(stopWord.toLowerCase());
        }
      }
    }
    return retVal;
  }

  /**
   * @param token
   *          token whose token class feature is looked up
   * @return true if the token class feature is unresolved or unset on the token; otherwise false
   *         if excludedTokenClasses is set and contains the token's class, or if only
   *         includedTokenClasses is set and does not contain it; true in all other cases. When
   *         both sets are configured, only excludedTokenClasses is consulted.
   */
  public boolean checkTokenClass(AnnotationFS token) {
    if (tokenClassFeature == null) {
      return true;
    }
    String tokenClass = token.getStringValue(tokenClassFeature);
    return (tokenClass == null) || isOK_TokenClass(tokenClass);
  }

  /**
   * @param token
   *          dictionary token whose token class is looked up
   * @return true if the token does not define a token class feature or its class is null;
   *         otherwise the result of the same inclusion/exclusion test applied to annotations
   */
  public boolean checkTokenClass(DictionaryToken token) {
    if (!token.isTokenClassFeatureDefined()) {
      return true;
    }
    String tokenClass = token.getTokenClass();
    return (tokenClass == null) || isOK_TokenClass(tokenClass);
  }

  /** Applies the class sets; the exclusion set takes precedence whenever it is configured. */
  private boolean isOK_TokenClass(String tokenClass) {
    if (excludedTokenClasses != null) {
      return !excludedTokenClasses.contains(tokenClass);
    }
    if (includedTokenClasses != null) {
      return includedTokenClasses.contains(tokenClass);
    }
    return true;
  }

  /**
   * @param stopWords
   *          set of lower-cased stop words; may be null
   * @param tokenText
   *          text to test; may be null
   * @return true if both arguments are non-null and the lower-cased text is in the set
   */
  static public boolean isStopWord(Set<String> stopWords, String tokenText) {
    return (stopWords != null) && (tokenText != null)
            && stopWords.contains(tokenText.toLowerCase());
  }

  /**
   * @param tokenText
   *          text to test; may be null
   * @return true if the lower-cased text is in this filter's stop word list
   */
  public boolean isStopWord(String tokenText) {
    return isStopWord(stopWords, tokenText);
  }

  /**
   * @param token
   *          token whose token type feature is looked up
   * @return false if tokenTypeFeature is set and the token's value for it is not OK; true
   *         otherwise
   */
  public boolean checkTokenType(AnnotationFS token) {
    if (tokenTypeFeature == null) {
      return true;
    }
    // getIntValue yields a primitive, so the boxed value can never be null.
    return isOK_TokenType(Integer.valueOf(token.getIntValue(tokenTypeFeature)));
  }

  /**
   * @param token
   *          dictionary token whose token type is looked up
   * @return true if the token does not define a token type feature or its type is null;
   *         otherwise the result of the same inclusion/exclusion test applied to annotations
   */
  public boolean checkTokenType(DictionaryToken token) {
    if (!token.isTokenTypeFeatureDefined()) {
      return true;
    }
    Integer tokenType = token.getType();
    return (tokenType == null) || isOK_TokenType(tokenType);
  }

  /** Applies the type sets; the exclusion set takes precedence whenever it is configured. */
  private boolean isOK_TokenType(Integer tokenType) {
    if (excludedTokenTypes != null) {
      return !excludedTokenTypes.contains(tokenType);
    }
    if (includedTokenTypes != null) {
      return includedTokenTypes.contains(tokenType);
    }
    return true;
  }

  /**
   * Resolves the token features, requiring every named feature to exist.
   *
   * @param typeSystem
   *          type system in which the token annotation type is looked up
   * @throws UnknownTypeException
   *           if the token annotation type or a named feature cannot be found
   */
  public void initTypes(TypeSystem typeSystem) throws UnknownTypeException {
    initTypes(typeSystem, true);
  }

  /**
   * Resolves the token class and token type features of the token annotation type.
   *
   * @param typeSystem
   *          type system in which the token annotation type is looked up
   * @param requireFeatureExistence
   *          if true, if the tokenType and/or tokenClass features of the tokenAnnotation are
   *          specified, they must exist. This is to allow for the situation where these features
   *          might not exist during dictionary loading, but are needed at annotator runtime
   * @throws UnknownTypeException
   *           if the token annotation type does not exist, or if a named feature does not exist
   *           and requireFeatureExistence is true
   */
  public void initTypes(TypeSystem typeSystem, boolean requireFeatureExistence)
          throws UnknownTypeException {
    Type tokenAnnotationType = typeSystem.getType(tokenAnnotationName);
    if (tokenAnnotationType == null) {
      String message = "TokenFilter.initTypes(), Could not find type: " + tokenAnnotationName;
      logError(message);
      throw new UnknownTypeException(message);
    }
    tokenClassFeature = resolveFeature(tokenAnnotationType, tokenClassFeatureName, "class",
            requireFeatureExistence);
    tokenTypeFeature = resolveFeature(tokenAnnotationType, tokenTypeFeatureName, "type",
            requireFeatureExistence);
  }

  /**
   * Looks up one feature of the token annotation type by base name.
   *
   * @param kind
   *          "class" or "type"; only used to word the error message
   * @return the feature, or null if no name was given or the feature is absent and not required
   */
  private Feature resolveFeature(Type tokenAnnotationType, String featureName, String kind,
          boolean requireFeatureExistence) throws UnknownTypeException {
    if ((featureName == null) || featureName.equals("")) {
      return null;
    }
    Feature feature = tokenAnnotationType.getFeatureByBaseName(featureName);
    if ((feature == null) && requireFeatureExistence) {
      String message = "Token " + kind + " feature name '" + featureName
              + "' specified, but does not exist for type: " + tokenAnnotationType.getName();
      logError(message);
      throw new UnknownTypeException(message);
    }
    return feature;
  }

  /**
   * @param token
   *          annotation to test
   * @param tokenNormalizer
   *          normalizer applied to the covered text before the stop word test
   * @return true if the token passes the class and type checks and its normalized covered text is
   *         not a stop word
   */
  public boolean isOK_Token(AnnotationFS token, TokenNormalizer tokenNormalizer) {
    return checkTokenClass(token) && checkTokenType(token)
            && !isStopWord(tokenNormalizer.normalize(token.getCoveredText()));
  }

  /**
   * @param token
   *          dictionary token to test
   * @param tokenNormalizer
   *          normalizer applied to the token text before the stop word test
   * @return true if the token passes the class and type checks and its normalized text is not a
   *         stop word
   */
  public boolean isOK_Token(DictionaryToken token, TokenNormalizer tokenNormalizer) {
    return checkTokenClass(token) && checkTokenType(token)
            && !isStopWord(tokenNormalizer.normalize(token.getText()));
  }
}