src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemmer.cs - lucenenet - Git at Google

 using System;
 using System.Globalization;
 using System.Text;

 namespace Lucene.Net.Analysis.De
 {
     // This file is encoded in UTF-8

     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// A stemmer for German words.
     /// <para>
     /// The algorithm is based on the report
     /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
     /// Caumanns (joerg.caumanns at isst.fhg.de).
     /// </para>
     /// </summary>
     public class GermanStemmer
     {
         /// <summary>
         /// Buffer for the terms while stemming them.
         /// </summary>
         private readonly StringBuilder sb = new StringBuilder();

         /// <summary>
         /// Amount of characters that are removed with <see cref="Substitute"/> while stemming.
         /// </summary>
         private int substCount = 0;

         private static readonly CultureInfo locale = new CultureInfo("de-DE");

         /// <summary>
         /// Stemms the given term to an unique <c>discriminator</c>.
         /// </summary>
         /// <param name="term">  The term that should be stemmed. </param>
         /// <returns>      Discriminator for <paramref name="term"/> </returns>
         protected internal virtual string Stem(string term)
         {
             // Use lowercase for medium stemming.
             term = locale.TextInfo.ToLower(term);
             if (!IsStemmable(term))
             {
                 return term;
             }
             // Reset the StringBuilder.
             sb.Remove(0, sb.Length);
             sb.Insert(0, term);
             // Stemming starts here...
             Substitute(sb);
             Strip(sb);
             Optimize(sb);
             Resubstitute(sb);
             RemoveParticleDenotion(sb);
             return sb.ToString();
         }

         /// <summary>
         /// Checks if a term could be stemmed.
         /// </summary>
         /// <returns>  true if, and only if, the given term consists in letters. </returns>
         private bool IsStemmable(string term)
         {
             for (int c = 0; c < term.Length; c++)
             {
                 if (!char.IsLetter(term[c]))
                 {
                     return false;
                 }
             }
             return true;
         }

         /// <summary>
         /// suffix stripping (stemming) on the current term. The stripping is reduced
         /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
         /// from which all regular suffixes are build of. The simplification causes
         /// some overstemming, and way more irregular stems, but still provides unique.
         /// discriminators in the most of those cases.
         /// The algorithm is context free, except of the length restrictions.
         /// </summary>
         private void Strip(StringBuilder buffer)
         {
             bool doMore = true;
             while (doMore && buffer.Length > 3)
             {
                 if ((buffer.Length + substCount > 5) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("nd", StringComparison.Ordinal))
                 {
                     buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
                 }
                 else if ((buffer.Length + substCount > 4) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("em", StringComparison.Ordinal))
                 {
                     buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
                 }
                 else if ((buffer.Length + substCount > 4) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("er", StringComparison.Ordinal))
                 {
                     buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
                 }
                 else if (buffer[buffer.Length - 1] == 'e')
                 {
                     buffer.Remove(buffer.Length - 1, 1);
                 }
                 else if (buffer[buffer.Length - 1] == 's')
                 {
                     buffer.Remove(buffer.Length - 1, 1);
                 }
                 else if (buffer[buffer.Length - 1] == 'n')
                 {
                     buffer.Remove(buffer.Length - 1, 1);
                 }
                 // "t" occurs only as suffix of verbs.
                 else if (buffer[buffer.Length - 1] == 't')
                 {
                     buffer.Remove(buffer.Length - 1, 1);
                 }
                 else
                 {
                     doMore = false;
                 }
             }
         }

         /// <summary>
         /// Does some optimizations on the term. This optimisations are
         /// contextual.
         /// </summary>
         private void Optimize(StringBuilder buffer)
         {
             // Additional step for female plurals of professions and inhabitants.
             if (buffer.Length > 5 && buffer.ToString(buffer.Length - 5, buffer.Length - (buffer.Length - 5)).Equals("erin*", StringComparison.Ordinal))
             {
                 buffer.Remove(buffer.Length - 1, 1);
                 Strip(buffer);
             }
             // Additional step for irregular plural nouns like "Matrizen -> Matrix".
             // NOTE: this length constraint is probably not a great value, its just to prevent AIOOBE on empty terms
             if (buffer.Length > 0 && buffer[buffer.Length - 1] == ('z'))
             {
                 buffer[buffer.Length - 1] = 'x';
             }
         }

         /// <summary>
         /// Removes a particle denotion ("ge") from a term.
         /// </summary>
         private void RemoveParticleDenotion(StringBuilder buffer)
         {
             if (buffer.Length > 4)
             {
                 for (int c = 0; c < buffer.Length - 3; c++)
                 {
                     if (buffer.ToString(c, 4).Equals("gege", StringComparison.Ordinal))
                     {
                         buffer.Remove(c, (c + 2) - c);
                         return;
                     }
                 }
             }
         }

         /// <summary>
         /// Do some substitutions for the term to reduce overstemming:
         ///
         /// <list type="bullet">
         /// <item><description>Substitute Umlauts with their corresponding vowel: äöü -> aou,
         ///   "ß" is substituted by "ss"</description></item>
         /// <item><description>Substitute a second char of a pair of equal characters with
         ///   an asterisk: ?? -> ?*</description></item>
         /// <item><description>Substitute some common character combinations with a token:
         ///   sch/ch/ei/ie/ig/st -> $/§/%/&amp;/#/!</description></item>
         /// </list>
         /// </summary>
         private void Substitute(StringBuilder buffer)
         {
             substCount = 0;
             for (int c = 0; c < buffer.Length; c++)
             {
                 // Replace the second char of a pair of the equal characters with an asterisk
                 if (c > 0 && buffer[c] == buffer[c - 1])
                 {
                     buffer[c] = '*';
                 }
                 // Substitute Umlauts.
                 else if (buffer[c] == 'ä')
                 {
                     buffer[c] = 'a';
                 }
                 else if (buffer[c] == 'ö')
                 {
                     buffer[c] = 'o';
                 }
                 else if (buffer[c] == 'ü')
                 {
                     buffer[c] = 'u';
                 }
                 // Fix bug so that 'ß' at the end of a word is replaced.
                 else if (buffer[c] == 'ß')
                 {
                     buffer[c] = 's';
                     buffer.Insert(c + 1, 's');
                     substCount++;
                 }
                 // Take care that at least one character is left left side from the current one
                 if (c < buffer.Length - 1)
                 {
                     // Masking several common character combinations with an token
                     if ((c < buffer.Length - 2) && buffer[c] == 's' && buffer[c + 1] == 'c' && buffer[c + 2] == 'h')
                     {
                         buffer[c] = '$';
                         buffer.Remove(c + 1, (c + 3) - (c + 1));
                         substCount = +2;
                     }
                     else if (buffer[c] == 'c' && buffer[c + 1] == 'h')
                     {
                         buffer[c] = '§';
                         buffer.Remove(c + 1, 1);
                         substCount++;
                     }
                     else if (buffer[c] == 'e' && buffer[c + 1] == 'i')
                     {
                         buffer[c] = '%';
                         buffer.Remove(c + 1, 1);
                         substCount++;
                     }
                     else if (buffer[c] == 'i' && buffer[c + 1] == 'e')
                     {
                         buffer[c] = '&';
                         buffer.Remove(c + 1, 1);
                         substCount++;
                     }
                     else if (buffer[c] == 'i' && buffer[c + 1] == 'g')
                     {
                         buffer[c] = '#';
                         buffer.Remove(c + 1, 1);
                         substCount++;
                     }
                     else if (buffer[c] == 's' && buffer[c + 1] == 't')
                     {
                         buffer[c] = '!';
                         buffer.Remove(c + 1, 1);
                         substCount++;
                     }
                 }
             }
         }

         /// <summary>
         /// Undoes the changes made by <see cref="Substitute"/>. That are character pairs and
         /// character combinations. Umlauts will remain as their corresponding vowel,
         /// as "ß" remains as "ss".
         /// </summary>
         private void Resubstitute(StringBuilder buffer)
         {
             for (int c = 0; c < buffer.Length; c++)
             {
                 if (buffer[c] == '*')
                 {
                     char x = buffer[c - 1];
                     buffer[c] = x;
                 }
                 else if (buffer[c] == '$')
                 {
                     buffer[c] = 's';
                     buffer.Insert(c + 1, new char[] { 'c', 'h' }, 0, 2);
                 }
                 else if (buffer[c] == '§')
                 {
                     buffer[c] = 'c';
                     buffer.Insert(c + 1, 'h');
                 }
                 else if (buffer[c] == '%')
                 {
                     buffer[c] = 'e';
                     buffer.Insert(c + 1, 'i');
                 }
                 else if (buffer[c] == '&')
                 {
                     buffer[c] = 'i';
                     buffer.Insert(c + 1, 'e');
                 }
                 else if (buffer[c] == '#')
                 {
                     buffer[c] = 'i';
                     buffer.Insert(c + 1, 'g');
                 }
                 else if (buffer[c] == '!')
                 {
                     buffer[c] = 's';
                     buffer.Insert(c + 1, 't');
                 }
             }
         }
     }
 }
	using System;
	using System.Globalization;
	using System.Text;

	namespace Lucene.Net.Analysis.De
	{
	// This file is encoded in UTF-8

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// A stemmer for German words.
	/// <para>
	/// The algorithm is based on the report
	/// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
	/// Caumanns (joerg.caumanns at isst.fhg.de).
	/// </para>
	/// </summary>
	public class GermanStemmer
	{
	/// <summary>
	/// Buffer for the terms while stemming them.
	/// </summary>
	private readonly StringBuilder sb = new StringBuilder();

	/// <summary>
	/// Amount of characters that are removed with <see cref="Substitute"/> while stemming.
	/// </summary>
	private int substCount = 0;

	private static readonly CultureInfo locale = new CultureInfo("de-DE");

	/// <summary>
	/// Stemms the given term to an unique <c>discriminator</c>.
	/// </summary>
	/// <param name="term"> The term that should be stemmed. </param>
	/// <returns> Discriminator for <paramref name="term"/> </returns>
	protected internal virtual string Stem(string term)
	{
	// Use lowercase for medium stemming.
	term = locale.TextInfo.ToLower(term);
	if (!IsStemmable(term))
	{
	return term;
	}
	// Reset the StringBuilder.
	sb.Remove(0, sb.Length);
	sb.Insert(0, term);
	// Stemming starts here...
	Substitute(sb);
	Strip(sb);
	Optimize(sb);
	Resubstitute(sb);
	RemoveParticleDenotion(sb);
	return sb.ToString();
	}

	/// <summary>
	/// Checks if a term could be stemmed.
	/// </summary>
	/// <returns> true if, and only if, the given term consists in letters. </returns>
	private bool IsStemmable(string term)
	{
	for (int c = 0; c < term.Length; c++)
	{
	if (!char.IsLetter(term[c]))
	{
	return false;
	}
	}
	return true;
	}

	/// <summary>
	/// suffix stripping (stemming) on the current term. The stripping is reduced
	/// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
	/// from which all regular suffixes are build of. The simplification causes
	/// some overstemming, and way more irregular stems, but still provides unique.
	/// discriminators in the most of those cases.
	/// The algorithm is context free, except of the length restrictions.
	/// </summary>
	private void Strip(StringBuilder buffer)
	{
	bool doMore = true;
	while (doMore && buffer.Length > 3)
	{
	if ((buffer.Length + substCount > 5) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("nd", StringComparison.Ordinal))
	{
	buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
	}
	else if ((buffer.Length + substCount > 4) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("em", StringComparison.Ordinal))
	{
	buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
	}
	else if ((buffer.Length + substCount > 4) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("er", StringComparison.Ordinal))
	{
	buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
	}
	else if (buffer[buffer.Length - 1] == 'e')
	{
	buffer.Remove(buffer.Length - 1, 1);
	}
	else if (buffer[buffer.Length - 1] == 's')
	{
	buffer.Remove(buffer.Length - 1, 1);
	}
	else if (buffer[buffer.Length - 1] == 'n')
	{
	buffer.Remove(buffer.Length - 1, 1);
	}
	// "t" occurs only as suffix of verbs.
	else if (buffer[buffer.Length - 1] == 't')
	{
	buffer.Remove(buffer.Length - 1, 1);
	}
	else
	{
	doMore = false;
	}
	}
	}

	/// <summary>
	/// Does some optimizations on the term. This optimisations are
	/// contextual.
	/// </summary>
	private void Optimize(StringBuilder buffer)
	{
	// Additional step for female plurals of professions and inhabitants.
	if (buffer.Length > 5 && buffer.ToString(buffer.Length - 5, buffer.Length - (buffer.Length - 5)).Equals("erin*", StringComparison.Ordinal))
	{
	buffer.Remove(buffer.Length - 1, 1);
	Strip(buffer);
	}
	// Additional step for irregular plural nouns like "Matrizen -> Matrix".
	// NOTE: this length constraint is probably not a great value, its just to prevent AIOOBE on empty terms
	if (buffer.Length > 0 && buffer[buffer.Length - 1] == ('z'))
	{
	buffer[buffer.Length - 1] = 'x';
	}
	}

	/// <summary>
	/// Removes a particle denotion ("ge") from a term.
	/// </summary>
	private void RemoveParticleDenotion(StringBuilder buffer)
	{
	if (buffer.Length > 4)
	{
	for (int c = 0; c < buffer.Length - 3; c++)
	{
	if (buffer.ToString(c, 4).Equals("gege", StringComparison.Ordinal))
	{
	buffer.Remove(c, (c + 2) - c);
	return;
	}
	}
	}
	}

	/// <summary>
	/// Do some substitutions for the term to reduce overstemming:
	///
	/// <list type="bullet">
	/// <item><description>Substitute Umlauts with their corresponding vowel: äöü -> aou,
	/// "ß" is substituted by "ss"</description></item>
	/// <item><description>Substitute a second char of a pair of equal characters with
	/// an asterisk: ?? -> ?*</description></item>
	/// <item><description>Substitute some common character combinations with a token:
	/// sch/ch/ei/ie/ig/st -> $/§/%/&/#/!</description></item>
	/// </list>
	/// </summary>
	private void Substitute(StringBuilder buffer)
	{
	substCount = 0;
	for (int c = 0; c < buffer.Length; c++)
	{
	// Replace the second char of a pair of the equal characters with an asterisk
	if (c > 0 && buffer[c] == buffer[c - 1])
	{
	buffer[c] = '*';
	}
	// Substitute Umlauts.
	else if (buffer[c] == 'ä')
	{
	buffer[c] = 'a';
	}
	else if (buffer[c] == 'ö')
	{
	buffer[c] = 'o';
	}
	else if (buffer[c] == 'ü')
	{
	buffer[c] = 'u';
	}
	// Fix bug so that 'ß' at the end of a word is replaced.
	else if (buffer[c] == 'ß')
	{
	buffer[c] = 's';
	buffer.Insert(c + 1, 's');
	substCount++;
	}
	// Take care that at least one character is left left side from the current one
	if (c < buffer.Length - 1)
	{
	// Masking several common character combinations with an token
	if ((c < buffer.Length - 2) && buffer[c] == 's' && buffer[c + 1] == 'c' && buffer[c + 2] == 'h')
	{
	buffer[c] = '$';
	buffer.Remove(c + 1, (c + 3) - (c + 1));
	substCount = +2;
	}
	else if (buffer[c] == 'c' && buffer[c + 1] == 'h')
	{
	buffer[c] = '§';
	buffer.Remove(c + 1, 1);
	substCount++;
	}
	else if (buffer[c] == 'e' && buffer[c + 1] == 'i')
	{
	buffer[c] = '%';
	buffer.Remove(c + 1, 1);
	substCount++;
	}
	else if (buffer[c] == 'i' && buffer[c + 1] == 'e')
	{
	buffer[c] = '&';
	buffer.Remove(c + 1, 1);
	substCount++;
	}
	else if (buffer[c] == 'i' && buffer[c + 1] == 'g')
	{
	buffer[c] = '#';
	buffer.Remove(c + 1, 1);
	substCount++;
	}
	else if (buffer[c] == 's' && buffer[c + 1] == 't')
	{
	buffer[c] = '!';
	buffer.Remove(c + 1, 1);
	substCount++;
	}
	}
	}
	}

	/// <summary>
	/// Undoes the changes made by <see cref="Substitute"/>. That are character pairs and
	/// character combinations. Umlauts will remain as their corresponding vowel,
	/// as "ß" remains as "ss".
	/// </summary>
	private void Resubstitute(StringBuilder buffer)
	{
	for (int c = 0; c < buffer.Length; c++)
	{
	if (buffer[c] == '*')
	{
	char x = buffer[c - 1];
	buffer[c] = x;
	}
	else if (buffer[c] == '$')
	{
	buffer[c] = 's';
	buffer.Insert(c + 1, new char[] { 'c', 'h' }, 0, 2);
	}
	else if (buffer[c] == '§')
	{
	buffer[c] = 'c';
	buffer.Insert(c + 1, 'h');
	}
	else if (buffer[c] == '%')
	{
	buffer[c] = 'e';
	buffer.Insert(c + 1, 'i');
	}
	else if (buffer[c] == '&')
	{
	buffer[c] = 'i';
	buffer.Insert(c + 1, 'e');
	}
	else if (buffer[c] == '#')
	{
	buffer[c] = 'i';
	buffer.Insert(c + 1, 'g');
	}
	else if (buffer[c] == '!')
	{
	buffer[c] = 's';
	buffer.Insert(c + 1, 't');
	}
	}
	}
	}
	}