blob: eaed1d39d5bb768e637302da5a69a998f66948fc [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.conceptMapper.support.tokens;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
import org.apache.uima.conceptMapper.Logger;
import org.apache.uima.conceptMapper.support.stemmer.Stemmer;
public class TokenNormalizer {
private static Pattern CapPat = Pattern.compile("^[A-Z][a-z]+$");
private static Pattern HasDigit = Pattern.compile("[0-9]");
/**
* replace instances of "," with the token "and" defaults to false
*/
private static final String PARAM_REPLACE_COMMA_WITH_AND = "ReplaceCommaWithAND";
/** Configuration parameter key/label for the case matching string */
public static final String PARAM_CASE_MATCH = "caseMatch";
/** Configuration parameter key/label for the stemmer class spec. If left out, no stemmer is used */
public static final String PARAM_STEMMER_CLASS = "Stemmer";
/**
* Configuration parameter key/label for the stemmer dictionary, passed into the stemmer's
* initialization method
*/
public static final String PARAM_STEMMER_DICT = "StemmerDictionary";
private boolean caseFoldAll;
private boolean caseFoldInitCap;
private boolean caseFoldDigit;
private String CASE_INSENSITIVE = "insensitive";
private String CASE_FOLD_DIGITS = "digitfold";
private String CASE_IGNORE = "ignoreall";
/** The stemmer that will perform the stemming. */
private Stemmer stemmer = null;
private boolean replaceCommaWithAND;
/**
* @param uimaContext -
* @param logger -
* @throws AnnotatorContextException -
*/
public TokenNormalizer(UimaContext uimaContext, Logger logger)
throws AnnotatorContextException {
super();
Boolean replaceCommaWithANDObj = (Boolean) uimaContext
.getConfigParameterValue(PARAM_REPLACE_COMMA_WITH_AND);
boolean replaceCommaWithAND = false;
if (replaceCommaWithANDObj != null) {
replaceCommaWithAND = replaceCommaWithANDObj.booleanValue();
}
String caseMatchParam = (String) uimaContext.getConfigParameterValue(PARAM_CASE_MATCH);
String stemmerParam = (String) uimaContext.getConfigParameterValue(PARAM_STEMMER_CLASS);
String stemmerDict = (String) uimaContext.getConfigParameterValue(PARAM_STEMMER_DICT);
this.replaceCommaWithAND = replaceCommaWithAND;
this.setCaseFoldInitCap(false);
this.setCaseFoldDigit(false);
this.setCaseFoldAll(false);
if (caseMatchParam != null) {
if (caseMatchParam.equalsIgnoreCase(CASE_INSENSITIVE)) {
this.setCaseFoldInitCap(true);
} else if (caseMatchParam.equalsIgnoreCase(CASE_FOLD_DIGITS)) {
this.setCaseFoldDigit(true);
} else if (caseMatchParam.equalsIgnoreCase(CASE_IGNORE)) {
this.setCaseFoldAll(true);
}
}
if (stemmerParam != null) {
try {
Class<?> stemmerClass = Class.forName(stemmerParam);
setStemmer((Stemmer) stemmerClass.newInstance());
getStemmer().initialize(stemmerDict);
} catch (Exception e) {
logger.logError("Exception trying to instantiate stemmer class: '" + stemmerParam
+ "', original exception:" + e.getMessage());
e.printStackTrace();
}
}
}
/**
* @return Returns the stemmer.
*/
public Stemmer getStemmer() {
return stemmer;
}
/**
* @param stemmer
* The stemmer to set.
*/
public void setStemmer(Stemmer stemmer) {
this.stemmer = stemmer;
}
public boolean shouldStem() {
return (getStemmer() != null);
}
/**
* @return Returns the caseFoldAll.
*/
public boolean isCaseFoldAll() {
return caseFoldAll;
}
/**
* @param caseFoldAll
* The caseFoldAll to set.
*/
public void setCaseFoldAll(boolean caseFoldAll) {
this.caseFoldAll = caseFoldAll;
}
/**
* @return Returns the caseFoldDigit.
*/
public boolean isCaseFoldDigit() {
return caseFoldDigit;
}
/**
* @param caseFoldDigit
* The caseFoldDigit to set.
*/
public void setCaseFoldDigit(boolean caseFoldDigit) {
this.caseFoldDigit = caseFoldDigit;
}
/**
* @return Returns the caseFoldInitCap.
*/
public boolean isCaseFoldInitCap() {
return caseFoldInitCap;
}
/**
* @param caseFoldInitCap
* The caseFoldInitCap to set.
*/
public void setCaseFoldInitCap(boolean caseFoldInitCap) {
this.caseFoldInitCap = caseFoldInitCap;
}
public boolean shouldFoldCase(String token) {
return (caseFoldAll || (caseFoldInitCap && CapPat.matcher(token).matches()) || (caseFoldDigit && HasDigit
.matcher(token).find()));
}
/**
* If one of the case folding flags is true and the input string matches the character pattern
* corresponding to that flag, then convert all letters to lowercase.
*
* @param token
* The string to case fold
*
* @return The case folded string
*/
public String foldCase(String token) {
if (shouldFoldCase(token)) {
return token.trim().toLowerCase();
}
return token;
}
/**
* If the stemming flag is true, then return the stemmed form of the supplied word using the
* Porter stemmer.
*
* @param token
* the word to stem
* @return the original word if the stemming flag is false, otherwise the stemmed form of the word
*/
public String stem(String token) {
if (shouldStem()) {
return getStemmer().stem(token.trim());
}
return token;
}
public String normalize(String token) {
if (replaceCommaWithAND && token.equals(",")) {
return stem(foldCase("and"));
}
return stem(foldCase(token));
}
}