ConceptMapper/src/main/java/org/apache/uima/conceptMapper/support/tokens/TokenNormalizer.java - uima-addons - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.conceptMapper.support.tokens;

 import java.util.regex.Pattern;

 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
 import org.apache.uima.conceptMapper.Logger;
 import org.apache.uima.conceptMapper.support.stemmer.Stemmer;

 /**
  * Normalizes token strings. Depending on configuration a token may be replaced (a lone ","
  * becomes "and"), lower-cased according to the selected case-matching mode, and stemmed by a
  * configured {@link Stemmer}.
  *
  * <p>
  * Values recognized for {@link #PARAM_CASE_MATCH} (compared ignoring case):
  * <ul>
  * <li>"insensitive" - fold only tokens consisting of one upper-case ASCII letter followed by one
  * or more lower-case ASCII letters</li>
  * <li>"digitfold" - fold only tokens containing an ASCII digit</li>
  * <li>"ignoreall" - fold every token</li>
  * </ul>
  * Any other value, or no value at all, leaves the case of tokens untouched.
  */
 public class TokenNormalizer {
   /** Matches a token made of one upper-case ASCII letter followed by lower-case ASCII letters. */
   private static final Pattern INIT_CAP_PATTERN = Pattern.compile("^[A-Z][a-z]+$");

   /** Finds an ASCII digit anywhere in a token. */
   private static final Pattern HAS_DIGIT_PATTERN = Pattern.compile("[0-9]");

   /**
    * replace instances of "," with the token "and" defaults to false
    */
   private static final String PARAM_REPLACE_COMMA_WITH_AND = "ReplaceCommaWithAND";

   /** Configuration parameter key/label for the case matching string */
   public static final String PARAM_CASE_MATCH = "caseMatch";

   /** Configuration parameter key/label for the stemmer class spec. If left out, no stemmer is used */
   public static final String PARAM_STEMMER_CLASS = "Stemmer";

   /**
    * Configuration parameter key/label for the stemmer dictionary, passed into the stemmer's
    * initialization method
    */
   public static final String PARAM_STEMMER_DICT = "StemmerDictionary";

   /** Case-match value that enables folding of initial-capital tokens. */
   private static final String CASE_INSENSITIVE = "insensitive";

   /** Case-match value that enables folding of tokens containing a digit. */
   private static final String CASE_FOLD_DIGITS = "digitfold";

   /** Case-match value that enables folding of every token. */
   private static final String CASE_IGNORE = "ignoreall";

   private boolean caseFoldAll;

   private boolean caseFoldInitCap;

   private boolean caseFoldDigit;

   /** The stemmer that will perform the stemming; null means no stemming. */
   private Stemmer stemmer = null;

   private boolean replaceCommaWithAND;

   /**
    * Reads the normalizer settings (comma replacement, case matching mode, stemmer class and
    * stemmer dictionary) from the given context.
    *
    * <p>
    * A stemmer that cannot be loaded, instantiated or initialized is reported through the logger
    * and stemming is left disabled; construction itself still succeeds.
    *
    * @param uimaContext
    *          source of the configuration parameter values
    * @param logger
    *          receives the error report when the stemmer cannot be set up
    * @throws AnnotatorContextException
    *           declared in the signature; nothing in this constructor throws it explicitly
    */
   public TokenNormalizer(UimaContext uimaContext, Logger logger)
           throws AnnotatorContextException {
     super();
     Boolean replaceCommaFlag = (Boolean) uimaContext
             .getConfigParameterValue(PARAM_REPLACE_COMMA_WITH_AND);
     String caseMatchParam = (String) uimaContext.getConfigParameterValue(PARAM_CASE_MATCH);
     String stemmerParam = (String) uimaContext.getConfigParameterValue(PARAM_STEMMER_CLASS);
     String stemmerDict = (String) uimaContext.getConfigParameterValue(PARAM_STEMMER_DICT);

     // An absent parameter means "do not replace commas".
     this.replaceCommaWithAND = Boolean.TRUE.equals(replaceCommaFlag);
     this.setCaseFoldInitCap(false);
     this.setCaseFoldDigit(false);
     this.setCaseFoldAll(false);

     // At most one folding mode is switched on; unrecognized values leave all of them off.
     if (caseMatchParam != null) {
       if (caseMatchParam.equalsIgnoreCase(CASE_INSENSITIVE)) {
         this.setCaseFoldInitCap(true);
       } else if (caseMatchParam.equalsIgnoreCase(CASE_FOLD_DIGITS)) {
         this.setCaseFoldDigit(true);
       } else if (caseMatchParam.equalsIgnoreCase(CASE_IGNORE)) {
         this.setCaseFoldAll(true);
       }
     }

     if (stemmerParam != null) {
       try {
         Class<?> stemmerClass = Class.forName(stemmerParam);
         Stemmer candidate = (Stemmer) stemmerClass.newInstance();
         candidate.initialize(stemmerDict);
         // Install the stemmer only once it is fully initialized, so that a failed
         // initialization cannot leave a half-configured stemmer active.
         setStemmer(candidate);
       } catch (Exception e) {
         logger.logError("Exception trying to instantiate stemmer class: '" + stemmerParam
                 + "', original exception:" + e.getMessage());
         // The logged text carries only the message, so the stack trace is still printed.
         e.printStackTrace();
       }
     }
   }

   /**
    * @return Returns the stemmer, or null when stemming is disabled.
    */
   public Stemmer getStemmer() {
     return stemmer;
   }

   /**
    * @param stemmer
    *          The stemmer to set; null disables stemming.
    */
   public void setStemmer(Stemmer stemmer) {
     this.stemmer = stemmer;
   }

   /**
    * @return true if a stemmer is available, i.e. {@link #getStemmer()} is not null.
    */
   public boolean shouldStem() {
     return (getStemmer() != null);
   }

   /**
    * @return Returns the caseFoldAll.
    */
   public boolean isCaseFoldAll() {
     return caseFoldAll;
   }

   /**
    * @param caseFoldAll
    *          The caseFoldAll to set.
    */
   public void setCaseFoldAll(boolean caseFoldAll) {
     this.caseFoldAll = caseFoldAll;
   }

   /**
    * @return Returns the caseFoldDigit.
    */
   public boolean isCaseFoldDigit() {
     return caseFoldDigit;
   }

   /**
    * @param caseFoldDigit
    *          The caseFoldDigit to set.
    */
   public void setCaseFoldDigit(boolean caseFoldDigit) {
     this.caseFoldDigit = caseFoldDigit;
   }

   /**
    * @return Returns the caseFoldInitCap.
    */
   public boolean isCaseFoldInitCap() {
     return caseFoldInitCap;
   }

   /**
    * @param caseFoldInitCap
    *          The caseFoldInitCap to set.
    */
   public void setCaseFoldInitCap(boolean caseFoldInitCap) {
     this.caseFoldInitCap = caseFoldInitCap;
   }

   /**
    * Decides whether the given token is subject to case folding under the current flags.
    *
    * @param token
    *          the token to examine, exactly as supplied (it is not trimmed here)
    * @return true if all tokens are folded, or if initial-capital folding is on and the token is
    *         one upper-case ASCII letter followed by lower-case ASCII letters, or if digit folding
    *         is on and the token contains an ASCII digit
    */
   public boolean shouldFoldCase(String token) {
     if (caseFoldAll) {
       return true;
     }
     if (caseFoldInitCap && INIT_CAP_PATTERN.matcher(token).matches()) {
       return true;
     }
     return caseFoldDigit && HAS_DIGIT_PATTERN.matcher(token).find();
   }

   /**
    * If one of the case folding flags is true and the input string matches the character pattern
    * corresponding to that flag, then trim the string and convert all letters to lowercase.
    * A token that is not folded is returned unchanged, including any surrounding whitespace.
    *
    * @param token
    *          The string to case fold
    *
    * @return The case folded string
    */
   public String foldCase(String token) {
     if (shouldFoldCase(token)) {
       // A fixed locale keeps the result independent of the JVM default locale
       // (for example, "I" must not become a dotless i under a Turkish default).
       return token.trim().toLowerCase(java.util.Locale.ENGLISH);
     }
     return token;
   }

   /**
    * If a stemmer is configured, return the stemmed form of the trimmed word as produced by that
    * stemmer.
    *
    * @param token
    *          the word to stem
    * @return the original word if no stemmer is configured, otherwise the stemmed form of the word
    */
   public String stem(String token) {
     if (shouldStem()) {
       return getStemmer().stem(token.trim());
     }
     return token;
   }

   /**
    * Applies the full normalization: optional replacement of a lone "," by "and", followed by case
    * folding and stemming.
    *
    * @param token
    *          the token to normalize
    * @return the normalized token
    */
   public String normalize(String token) {
     String effective = (replaceCommaWithAND && token.equals(",")) ? "and" : token;
     return stem(foldCase(effective));
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.uima.conceptMapper.support.tokens;

	import java.util.regex.Pattern;

	import org.apache.uima.UimaContext;
	import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
	import org.apache.uima.conceptMapper.Logger;
	import org.apache.uima.conceptMapper.support.stemmer.Stemmer;

	public class TokenNormalizer {
	private static Pattern CapPat = Pattern.compile("^[A-Z][a-z]+$");

	private static Pattern HasDigit = Pattern.compile("[0-9]");

	/**
	* replace instances of "," with the token "and" defaults to false
	*/
	private static final String PARAM_REPLACE_COMMA_WITH_AND = "ReplaceCommaWithAND";

	/** Configuration parameter key/label for the case matching string */
	public static final String PARAM_CASE_MATCH = "caseMatch";

	/** Configuration parameter key/label for the stemmer class spec. If left out, no stemmer is used */
	public static final String PARAM_STEMMER_CLASS = "Stemmer";

	/**
	* Configuration parameter key/label for the stemmer dictionary, passed into the stemmer's
	* initialization method
	*/
	public static final String PARAM_STEMMER_DICT = "StemmerDictionary";

	private boolean caseFoldAll;

	private boolean caseFoldInitCap;

	private boolean caseFoldDigit;

	private String CASE_INSENSITIVE = "insensitive";

	private String CASE_FOLD_DIGITS = "digitfold";

	private String CASE_IGNORE = "ignoreall";

	/** The stemmer that will perform the stemming. */
	private Stemmer stemmer = null;

	private boolean replaceCommaWithAND;

	/**
	* @param uimaContext -
	* @param logger -
	* @throws AnnotatorContextException -
	*/
	public TokenNormalizer(UimaContext uimaContext, Logger logger)
	throws AnnotatorContextException {
	super();
	Boolean replaceCommaWithANDObj = (Boolean) uimaContext
	.getConfigParameterValue(PARAM_REPLACE_COMMA_WITH_AND);
	boolean replaceCommaWithAND = false;
	if (replaceCommaWithANDObj != null) {
	replaceCommaWithAND = replaceCommaWithANDObj.booleanValue();
	}
	String caseMatchParam = (String) uimaContext.getConfigParameterValue(PARAM_CASE_MATCH);
	String stemmerParam = (String) uimaContext.getConfigParameterValue(PARAM_STEMMER_CLASS);
	String stemmerDict = (String) uimaContext.getConfigParameterValue(PARAM_STEMMER_DICT);

	this.replaceCommaWithAND = replaceCommaWithAND;
	this.setCaseFoldInitCap(false);
	this.setCaseFoldDigit(false);
	this.setCaseFoldAll(false);

	if (caseMatchParam != null) {
	if (caseMatchParam.equalsIgnoreCase(CASE_INSENSITIVE)) {
	this.setCaseFoldInitCap(true);
	} else if (caseMatchParam.equalsIgnoreCase(CASE_FOLD_DIGITS)) {
	this.setCaseFoldDigit(true);
	} else if (caseMatchParam.equalsIgnoreCase(CASE_IGNORE)) {
	this.setCaseFoldAll(true);
	}
	}

	if (stemmerParam != null) {
	try {
	Class<?> stemmerClass = Class.forName(stemmerParam);
	setStemmer((Stemmer) stemmerClass.newInstance());
	getStemmer().initialize(stemmerDict);
	} catch (Exception e) {
	logger.logError("Exception trying to instantiate stemmer class: '" + stemmerParam
	+ "', original exception:" + e.getMessage());
	e.printStackTrace();
	}
	}
	}

	/**
	* @return Returns the stemmer.
	*/
	public Stemmer getStemmer() {
	return stemmer;
	}

	/**
	* @param stemmer
	* The stemmer to set.
	*/
	public void setStemmer(Stemmer stemmer) {
	this.stemmer = stemmer;
	}

	public boolean shouldStem() {
	return (getStemmer() != null);
	}

	/**
	* @return Returns the caseFoldAll.
	*/
	public boolean isCaseFoldAll() {
	return caseFoldAll;
	}

	/**
	* @param caseFoldAll
	* The caseFoldAll to set.
	*/
	public void setCaseFoldAll(boolean caseFoldAll) {
	this.caseFoldAll = caseFoldAll;
	}

	/**
	* @return Returns the caseFoldDigit.
	*/
	public boolean isCaseFoldDigit() {
	return caseFoldDigit;
	}

	/**
	* @param caseFoldDigit
	* The caseFoldDigit to set.
	*/
	public void setCaseFoldDigit(boolean caseFoldDigit) {
	this.caseFoldDigit = caseFoldDigit;
	}

	/**
	* @return Returns the caseFoldInitCap.
	*/
	public boolean isCaseFoldInitCap() {
	return caseFoldInitCap;
	}

	/**
	* @param caseFoldInitCap
	* The caseFoldInitCap to set.
	*/
	public void setCaseFoldInitCap(boolean caseFoldInitCap) {
	this.caseFoldInitCap = caseFoldInitCap;
	}

	public boolean shouldFoldCase(String token) {
	return (caseFoldAll \|\| (caseFoldInitCap && CapPat.matcher(token).matches()) \|\| (caseFoldDigit && HasDigit
	.matcher(token).find()));
	}

	/**
	* If one of the case folding flags is true and the input string matches the character pattern
	* corresponding to that flag, then convert all letters to lowercase.
	*
	* @param token
	* The string to case fold
	*
	* @return The case folded string
	*/
	public String foldCase(String token) {
	if (shouldFoldCase(token)) {
	return token.trim().toLowerCase();
	}
	return token;
	}

	/**
	* If the stemming flag is true, then return the stemmed form of the supplied word using the
	* Porter stemmer.
	*
	* @param token
	* the word to stem
	* @return the original word if the stemming flag is false, otherwise the stemmed form of the word
	*/
	public String stem(String token) {
	if (shouldStem()) {
	return getStemmer().stem(token.trim());
	}
	return token;
	}

	public String normalize(String token) {
	if (replaceCommaWithAND && token.equals(",")) {
	return stem(foldCase("and"));
	}
	return stem(foldCase(token));
	}
	}