lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.de;

 import java.util.Locale;

 // This file is encoded in UTF-8

 /**
  * A stemmer for German words.
  *
  * <p>The algorithm is based on the report "A Fast and Simple Stemming Algorithm for German Words"
  * by J&ouml;rg Caumanns (joerg.caumanns at isst.fhg.de).
  *
  * <p>Each instance owns a reusable work buffer and a counter that {@link #stem(String)} overwrites
  * on every call; the class performs no synchronization of its own.
  */
 public class GermanStemmer {
   /** Locale applied when lowercasing incoming terms. */
   private static final Locale locale = new Locale("de", "DE");

   /** Sentinel returned by {@link #maskFor(char, char)} when a pair has no masking token. */
   private static final char NO_TOKEN = '\0';

   /** Work buffer holding the term while it is being stemmed; reused between calls. */
   private StringBuilder sb = new StringBuilder();

   /**
    * Counter maintained by <code>substitute()</code>: incremented by the number of characters a
    * masking step removes, and also by one for every "ß" that is expanded to "ss". <code>strip()
    * </code> adds it to the buffer length when applying its length restrictions.
    */
   private int substCount = 0;

   /**
    * Stems the given term to a <code>discriminator</code>.
    *
    * <p>The term is lowercased first. If it contains any non-letter character the lowercased term
    * is returned unchanged; otherwise it runs through substitution, suffix stripping,
    * optimization, resubstitution and "ge" particle removal, in that order.
    *
    * @param term The term that should be stemmed.
    * @return Discriminator for <code>term</code>
    */
   protected String stem(String term) {
     final String lowered = term.toLowerCase(locale);
     if (!isStemmable(lowered)) {
       return lowered;
     }
     // Start from a clean buffer that contains only the lowercased term.
     sb.setLength(0);
     sb.append(lowered);
     substitute(sb);
     strip(sb);
     optimize(sb);
     resubstitute(sb);
     removeParticleDenotion(sb);
     return sb.toString();
   }

   /**
    * Checks if a term could be stemmed.
    *
    * @return true if, and only if, every character of the term is a letter (an empty term
    *     therefore qualifies).
    */
   private boolean isStemmable(String term) {
     for (char ch : term.toCharArray()) {
       if (!Character.isLetter(ch)) {
         return false;
       }
     }
     return true;
   }

   /**
    * Suffix stripping (stemming) on the current term. The stripping is reduced to the seven "base"
    * suffixes "e", "s", "n", "t", "em", "er" and "nd". Suffixes are removed repeatedly while the
    * buffer is longer than three characters. The two-letter suffixes are only removed when the
    * buffer length plus <code>substCount</code> exceeds 5 ("nd") or 4 ("em", "er"); apart from
    * these length restrictions the stripping is context free.
    */
   private void strip(StringBuilder buffer) {
     while (buffer.length() > 3) {
       final int len = buffer.length();
       final int effectiveLength = len + substCount;
       final char last = buffer.charAt(len - 1);
       final char penultimate = buffer.charAt(len - 2);

       final int cut;
       if (effectiveLength > 5 && penultimate == 'n' && last == 'd') {
         cut = 2;
       } else if (effectiveLength > 4 && penultimate == 'e' && (last == 'm' || last == 'r')) {
         cut = 2;
       } else if (last == 'e' || last == 's' || last == 'n' || last == 't') {
         // "t" occurs only as suffix of verbs.
         cut = 1;
       } else {
         // No known suffix left.
         break;
       }
       buffer.setLength(len - cut);
     }
   }

   /** Applies the contextual optimizations that follow suffix stripping. */
   private void optimize(StringBuilder buffer) {
     // Additional step for female plurals of professions and inhabitants: drop the masked
     // doubled "n" and strip once more.
     if (buffer.length() > 5 && endsWith(buffer, "erin*")) {
       buffer.setLength(buffer.length() - 1);
       strip(buffer);
     }
     // Additional step for irregular plural nouns like "Matrizen -> Matrix".
     // The emptiness check only guards the index below against empty terms.
     final int lastIndex = buffer.length() - 1;
     if (lastIndex >= 0 && buffer.charAt(lastIndex) == 'z') {
       buffer.setCharAt(lastIndex, 'x');
     }
   }

   /**
    * Removes a particle denotion ("ge") from a term: in buffers longer than four characters the
    * first two characters of the first "gege" occurrence are deleted.
    */
   private void removeParticleDenotion(StringBuilder buffer) {
     if (buffer.length() <= 4) {
       return;
     }
     final int at = buffer.indexOf("gege");
     if (at >= 0) {
       buffer.delete(at, at + 2);
     }
   }

   /**
    * Do some substitutions for the term to reduce overstemming:
    *
    * <p>- Substitute Umlauts with their corresponding vowel:{@code äöü -> aou}, "ß" is substituted
    * by "ss" - Substitute a second char of a pair of equal characters with an asterisk: {@code ?? ->
    * ?*} - Substitute some common character combinations with a token: {@code sch/ch/ei/ie/ig/st ->
    * $/§/%/&/#/!}
    *
    * <p>Resets <code>substCount</code> to zero before processing.
    */
   private void substitute(StringBuilder buffer) {
     substCount = 0;
     for (int pos = 0; pos < buffer.length(); pos++) {
       // Stage 1: single-character rewrites at the current position.
       final char current = buffer.charAt(pos);
       if (pos > 0 && current == buffer.charAt(pos - 1)) {
         // Second char of a pair of equal characters becomes an asterisk.
         buffer.setCharAt(pos, '*');
       } else {
         switch (current) {
           case 'ä':
             buffer.setCharAt(pos, 'a');
             break;
           case 'ö':
             buffer.setCharAt(pos, 'o');
             break;
           case 'ü':
             buffer.setCharAt(pos, 'u');
             break;
           case 'ß':
             // Handled here so that a 'ß' at the end of a word is replaced as well.
             buffer.replace(pos, pos + 1, "ss");
             substCount++;
             break;
           default:
             break;
         }
       }

       // Stage 2: mask common character combinations starting at the current position.
       // Needs at least one character to the right of the current one.
       final int charsAfter = buffer.length() - pos - 1;
       if (charsAfter < 1) {
         continue;
       }
       final char head = buffer.charAt(pos);
       final char next = buffer.charAt(pos + 1);
       if (charsAfter >= 2 && head == 's' && next == 'c' && buffer.charAt(pos + 2) == 'h') {
         buffer.replace(pos, pos + 3, "$");
         substCount += 2;
       } else {
         final char token = maskFor(head, next);
         if (token != NO_TOKEN) {
           buffer.setCharAt(pos, token);
           buffer.deleteCharAt(pos + 1);
           substCount++;
         }
       }
     }
   }

   /**
    * Undoes the changes made by substitute(). That are character pairs and character combinations.
    * Umlauts will remain as their corresponding vowel, as "ß" remains as "ss".
    */
   private void resubstitute(StringBuilder buffer) {
     for (int pos = 0; pos < buffer.length(); pos++) {
       final char current = buffer.charAt(pos);
       if (current == '*') {
         // A masked doubled character repeats its predecessor.
         buffer.setCharAt(pos, buffer.charAt(pos - 1));
         continue;
       }
       final String expansion = expansionOf(current);
       if (expansion != null) {
         buffer.replace(pos, pos + 1, expansion);
         // Skip the letters just written; none of them is a token.
         pos += expansion.length() - 1;
       }
     }
   }

   /** Tells whether the buffer ends with the given suffix. */
   private static boolean endsWith(StringBuilder buffer, String suffix) {
     final int offset = buffer.length() - suffix.length();
     return offset >= 0 && buffer.indexOf(suffix, offset) == offset;
   }

   /** Returns the masking token for a two-character combination, or NO_TOKEN if there is none. */
   private static char maskFor(char first, char second) {
     switch (first) {
       case 'c':
         return second == 'h' ? '§' : NO_TOKEN;
       case 'e':
         return second == 'i' ? '%' : NO_TOKEN;
       case 'i':
         if (second == 'e') {
           return '&';
         }
         return second == 'g' ? '#' : NO_TOKEN;
       case 's':
         return second == 't' ? '!' : NO_TOKEN;
       default:
         return NO_TOKEN;
     }
   }

   /** Returns the letters a masking token stands for, or null if the character is no token. */
   private static String expansionOf(char token) {
     switch (token) {
       case '$':
         return "sch";
       case '§':
         return "ch";
       case '%':
         return "ei";
       case '&':
         return "ie";
       case '#':
         return "ig";
       case '!':
         return "st";
       default:
         return null;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.de;

	import java.util.Locale;

	// This file is encoded in UTF-8

	/**
	 * A stemmer for German words.
	 *
	 * <p>The algorithm is based on the report "A Fast and Simple Stemming Algorithm for German Words"
	 * by Jörg Caumanns (joerg.caumanns at isst.fhg.de).
	 *
	 * <p>Each instance owns a reusable work buffer and a counter that {@link #stem(String)} overwrites
	 * on every call; the class performs no synchronization of its own.
	 */
	public class GermanStemmer {
		/** Locale applied when lowercasing incoming terms. */
		private static final Locale locale = new Locale("de", "DE");

		/** Sentinel returned by {@link #maskFor(char, char)} when a pair has no masking token. */
		private static final char NO_TOKEN = '\0';

		/** Work buffer holding the term while it is being stemmed; reused between calls. */
		private StringBuilder sb = new StringBuilder();

		/**
		 * Counter maintained by <code>substitute()</code>: incremented by the number of characters a
		 * masking step removes, and also by one for every "ß" that is expanded to "ss". <code>strip()
		 * </code> adds it to the buffer length when applying its length restrictions.
		 */
		private int substCount = 0;

		/**
		 * Stems the given term to a <code>discriminator</code>.
		 *
		 * <p>The term is lowercased first. If it contains any non-letter character the lowercased term
		 * is returned unchanged; otherwise it runs through substitution, suffix stripping,
		 * optimization, resubstitution and "ge" particle removal, in that order.
		 *
		 * @param term The term that should be stemmed.
		 * @return Discriminator for <code>term</code>
		 */
		protected String stem(String term) {
			final String lowered = term.toLowerCase(locale);
			if (!isStemmable(lowered)) {
				return lowered;
			}
			// Start from a clean buffer that contains only the lowercased term.
			sb.setLength(0);
			sb.append(lowered);
			substitute(sb);
			strip(sb);
			optimize(sb);
			resubstitute(sb);
			removeParticleDenotion(sb);
			return sb.toString();
		}

		/**
		 * Checks if a term could be stemmed.
		 *
		 * @return true if, and only if, every character of the term is a letter (an empty term
		 *     therefore qualifies).
		 */
		private boolean isStemmable(String term) {
			for (char ch : term.toCharArray()) {
				if (!Character.isLetter(ch)) {
					return false;
				}
			}
			return true;
		}

		/**
		 * Suffix stripping (stemming) on the current term. The stripping is reduced to the seven "base"
		 * suffixes "e", "s", "n", "t", "em", "er" and "nd". Suffixes are removed repeatedly while the
		 * buffer is longer than three characters. The two-letter suffixes are only removed when the
		 * buffer length plus <code>substCount</code> exceeds 5 ("nd") or 4 ("em", "er"); apart from
		 * these length restrictions the stripping is context free.
		 */
		private void strip(StringBuilder buffer) {
			while (buffer.length() > 3) {
				final int len = buffer.length();
				final int effectiveLength = len + substCount;
				final char last = buffer.charAt(len - 1);
				final char penultimate = buffer.charAt(len - 2);

				final int cut;
				if (effectiveLength > 5 && penultimate == 'n' && last == 'd') {
					cut = 2;
				} else if (effectiveLength > 4 && penultimate == 'e' && (last == 'm' || last == 'r')) {
					cut = 2;
				} else if (last == 'e' || last == 's' || last == 'n' || last == 't') {
					// "t" occurs only as suffix of verbs.
					cut = 1;
				} else {
					// No known suffix left.
					break;
				}
				buffer.setLength(len - cut);
			}
		}

		/** Applies the contextual optimizations that follow suffix stripping. */
		private void optimize(StringBuilder buffer) {
			// Additional step for female plurals of professions and inhabitants: drop the masked
			// doubled "n" and strip once more.
			if (buffer.length() > 5 && endsWith(buffer, "erin*")) {
				buffer.setLength(buffer.length() - 1);
				strip(buffer);
			}
			// Additional step for irregular plural nouns like "Matrizen -> Matrix".
			// The emptiness check only guards the index below against empty terms.
			final int lastIndex = buffer.length() - 1;
			if (lastIndex >= 0 && buffer.charAt(lastIndex) == 'z') {
				buffer.setCharAt(lastIndex, 'x');
			}
		}

		/**
		 * Removes a particle denotion ("ge") from a term: in buffers longer than four characters the
		 * first two characters of the first "gege" occurrence are deleted.
		 */
		private void removeParticleDenotion(StringBuilder buffer) {
			if (buffer.length() <= 4) {
				return;
			}
			final int at = buffer.indexOf("gege");
			if (at >= 0) {
				buffer.delete(at, at + 2);
			}
		}

		/**
		 * Do some substitutions for the term to reduce overstemming:
		 *
		 * <p>- Substitute Umlauts with their corresponding vowel:{@code äöü -> aou}, "ß" is substituted
		 * by "ss" - Substitute a second char of a pair of equal characters with an asterisk: {@code ?? ->
		 * ?*} - Substitute some common character combinations with a token: {@code sch/ch/ei/ie/ig/st ->
		 * $/§/%/&/#/!}
		 *
		 * <p>Resets <code>substCount</code> to zero before processing.
		 */
		private void substitute(StringBuilder buffer) {
			substCount = 0;
			for (int pos = 0; pos < buffer.length(); pos++) {
				// Stage 1: single-character rewrites at the current position.
				final char current = buffer.charAt(pos);
				if (pos > 0 && current == buffer.charAt(pos - 1)) {
					// Second char of a pair of equal characters becomes an asterisk.
					buffer.setCharAt(pos, '*');
				} else {
					switch (current) {
						case 'ä':
							buffer.setCharAt(pos, 'a');
							break;
						case 'ö':
							buffer.setCharAt(pos, 'o');
							break;
						case 'ü':
							buffer.setCharAt(pos, 'u');
							break;
						case 'ß':
							// Handled here so that a 'ß' at the end of a word is replaced as well.
							buffer.replace(pos, pos + 1, "ss");
							substCount++;
							break;
						default:
							break;
					}
				}

				// Stage 2: mask common character combinations starting at the current position.
				// Needs at least one character to the right of the current one.
				final int charsAfter = buffer.length() - pos - 1;
				if (charsAfter < 1) {
					continue;
				}
				final char head = buffer.charAt(pos);
				final char next = buffer.charAt(pos + 1);
				if (charsAfter >= 2 && head == 's' && next == 'c' && buffer.charAt(pos + 2) == 'h') {
					buffer.replace(pos, pos + 3, "$");
					substCount += 2;
				} else {
					final char token = maskFor(head, next);
					if (token != NO_TOKEN) {
						buffer.setCharAt(pos, token);
						buffer.deleteCharAt(pos + 1);
						substCount++;
					}
				}
			}
		}

		/**
		 * Undoes the changes made by substitute(). That are character pairs and character combinations.
		 * Umlauts will remain as their corresponding vowel, as "ß" remains as "ss".
		 */
		private void resubstitute(StringBuilder buffer) {
			for (int pos = 0; pos < buffer.length(); pos++) {
				final char current = buffer.charAt(pos);
				if (current == '*') {
					// A masked doubled character repeats its predecessor.
					buffer.setCharAt(pos, buffer.charAt(pos - 1));
					continue;
				}
				final String expansion = expansionOf(current);
				if (expansion != null) {
					buffer.replace(pos, pos + 1, expansion);
					// Skip the letters just written; none of them is a token.
					pos += expansion.length() - 1;
				}
			}
		}

		/** Tells whether the buffer ends with the given suffix. */
		private static boolean endsWith(StringBuilder buffer, String suffix) {
			final int offset = buffer.length() - suffix.length();
			return offset >= 0 && buffer.indexOf(suffix, offset) == offset;
		}

		/** Returns the masking token for a two-character combination, or NO_TOKEN if there is none. */
		private static char maskFor(char first, char second) {
			switch (first) {
				case 'c':
					return second == 'h' ? '§' : NO_TOKEN;
				case 'e':
					return second == 'i' ? '%' : NO_TOKEN;
				case 'i':
					if (second == 'e') {
						return '&';
					}
					return second == 'g' ? '#' : NO_TOKEN;
				case 's':
					return second == 't' ? '!' : NO_TOKEN;
				default:
					return NO_TOKEN;
			}
		}

		/** Returns the letters a masking token stands for, or null if the character is no token. */
		private static String expansionOf(char token) {
			switch (token) {
				case '$':
					return "sch";
				case '§':
					return "ch";
				case '%':
					return "ei";
				case '&':
					return "ie";
				case '#':
					return "ig";
				case '!':
					return "st";
				default:
					return null;
			}
		}
	}