blob: 2a2c9b85cd9b2db010467c9c93dd372d2c2bf328 [file] [log] [blame]
using System;
using System.Globalization;
using System.Text;
namespace Lucene.Net.Analysis.De
{
// This file is encoded in UTF-8
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A stemmer for German words.
/// <para>
/// The algorithm is based on the report
/// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
/// Caumanns (joerg.caumanns at isst.fhg.de).
/// </para>
/// </summary>
public class GermanStemmer
{
/// <summary>
/// Buffer for the terms while stemming them.
/// </summary>
private readonly StringBuilder sb = new StringBuilder();
/// <summary>
/// Amount of characters that are removed with <see cref="Substitute"/> while stemming.
/// </summary>
private int substCount = 0;
private static readonly CultureInfo locale = new CultureInfo("de-DE");
/// <summary>
/// Stemms the given term to an unique <c>discriminator</c>.
/// </summary>
/// <param name="term"> The term that should be stemmed. </param>
/// <returns> Discriminator for <paramref name="term"/> </returns>
protected internal virtual string Stem(string term)
{
// Use lowercase for medium stemming.
term = locale.TextInfo.ToLower(term);
if (!IsStemmable(term))
{
return term;
}
// Reset the StringBuilder.
sb.Remove(0, sb.Length);
sb.Insert(0, term);
// Stemming starts here...
Substitute(sb);
Strip(sb);
Optimize(sb);
Resubstitute(sb);
RemoveParticleDenotion(sb);
return sb.ToString();
}
/// <summary>
/// Checks if a term could be stemmed.
/// </summary>
/// <returns> true if, and only if, the given term consists in letters. </returns>
private bool IsStemmable(string term)
{
for (int c = 0; c < term.Length; c++)
{
if (!char.IsLetter(term[c]))
{
return false;
}
}
return true;
}
/// <summary>
/// suffix stripping (stemming) on the current term. The stripping is reduced
/// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd",
/// from which all regular suffixes are build of. The simplification causes
/// some overstemming, and way more irregular stems, but still provides unique.
/// discriminators in the most of those cases.
/// The algorithm is context free, except of the length restrictions.
/// </summary>
private void Strip(StringBuilder buffer)
{
bool doMore = true;
while (doMore && buffer.Length > 3)
{
if ((buffer.Length + substCount > 5) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("nd", StringComparison.Ordinal))
{
buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
}
else if ((buffer.Length + substCount > 4) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("em", StringComparison.Ordinal))
{
buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
}
else if ((buffer.Length + substCount > 4) && buffer.ToString(buffer.Length - 2, buffer.Length - (buffer.Length - 2)).Equals("er", StringComparison.Ordinal))
{
buffer.Remove(buffer.Length - 2, buffer.Length - (buffer.Length - 2));
}
else if (buffer[buffer.Length - 1] == 'e')
{
buffer.Remove(buffer.Length - 1, 1);
}
else if (buffer[buffer.Length - 1] == 's')
{
buffer.Remove(buffer.Length - 1, 1);
}
else if (buffer[buffer.Length - 1] == 'n')
{
buffer.Remove(buffer.Length - 1, 1);
}
// "t" occurs only as suffix of verbs.
else if (buffer[buffer.Length - 1] == 't')
{
buffer.Remove(buffer.Length - 1, 1);
}
else
{
doMore = false;
}
}
}
/// <summary>
/// Does some optimizations on the term. This optimisations are
/// contextual.
/// </summary>
private void Optimize(StringBuilder buffer)
{
// Additional step for female plurals of professions and inhabitants.
if (buffer.Length > 5 && buffer.ToString(buffer.Length - 5, buffer.Length - (buffer.Length - 5)).Equals("erin*", StringComparison.Ordinal))
{
buffer.Remove(buffer.Length - 1, 1);
Strip(buffer);
}
// Additional step for irregular plural nouns like "Matrizen -> Matrix".
// NOTE: this length constraint is probably not a great value, its just to prevent AIOOBE on empty terms
if (buffer.Length > 0 && buffer[buffer.Length - 1] == ('z'))
{
buffer[buffer.Length - 1] = 'x';
}
}
/// <summary>
/// Removes a particle denotion ("ge") from a term.
/// </summary>
private void RemoveParticleDenotion(StringBuilder buffer)
{
if (buffer.Length > 4)
{
for (int c = 0; c < buffer.Length - 3; c++)
{
if (buffer.ToString(c, 4).Equals("gege", StringComparison.Ordinal))
{
buffer.Remove(c, (c + 2) - c);
return;
}
}
}
}
/// <summary>
/// Do some substitutions for the term to reduce overstemming:
///
/// <list type="bullet">
/// <item><description>Substitute Umlauts with their corresponding vowel: äöü -> aou,
/// "ß" is substituted by "ss"</description></item>
/// <item><description>Substitute a second char of a pair of equal characters with
/// an asterisk: ?? -> ?*</description></item>
/// <item><description>Substitute some common character combinations with a token:
/// sch/ch/ei/ie/ig/st -> $/§/%/&amp;/#/!</description></item>
/// </list>
/// </summary>
private void Substitute(StringBuilder buffer)
{
substCount = 0;
for (int c = 0; c < buffer.Length; c++)
{
// Replace the second char of a pair of the equal characters with an asterisk
if (c > 0 && buffer[c] == buffer[c - 1])
{
buffer[c] = '*';
}
// Substitute Umlauts.
else if (buffer[c] == 'ä')
{
buffer[c] = 'a';
}
else if (buffer[c] == 'ö')
{
buffer[c] = 'o';
}
else if (buffer[c] == 'ü')
{
buffer[c] = 'u';
}
// Fix bug so that 'ß' at the end of a word is replaced.
else if (buffer[c] == 'ß')
{
buffer[c] = 's';
buffer.Insert(c + 1, 's');
substCount++;
}
// Take care that at least one character is left left side from the current one
if (c < buffer.Length - 1)
{
// Masking several common character combinations with an token
if ((c < buffer.Length - 2) && buffer[c] == 's' && buffer[c + 1] == 'c' && buffer[c + 2] == 'h')
{
buffer[c] = '$';
buffer.Remove(c + 1, (c + 3) - (c + 1));
substCount = +2;
}
else if (buffer[c] == 'c' && buffer[c + 1] == 'h')
{
buffer[c] = '§';
buffer.Remove(c + 1, 1);
substCount++;
}
else if (buffer[c] == 'e' && buffer[c + 1] == 'i')
{
buffer[c] = '%';
buffer.Remove(c + 1, 1);
substCount++;
}
else if (buffer[c] == 'i' && buffer[c + 1] == 'e')
{
buffer[c] = '&';
buffer.Remove(c + 1, 1);
substCount++;
}
else if (buffer[c] == 'i' && buffer[c + 1] == 'g')
{
buffer[c] = '#';
buffer.Remove(c + 1, 1);
substCount++;
}
else if (buffer[c] == 's' && buffer[c + 1] == 't')
{
buffer[c] = '!';
buffer.Remove(c + 1, 1);
substCount++;
}
}
}
}
/// <summary>
/// Undoes the changes made by <see cref="Substitute"/>. That are character pairs and
/// character combinations. Umlauts will remain as their corresponding vowel,
/// as "ß" remains as "ss".
/// </summary>
private void Resubstitute(StringBuilder buffer)
{
for (int c = 0; c < buffer.Length; c++)
{
if (buffer[c] == '*')
{
char x = buffer[c - 1];
buffer[c] = x;
}
else if (buffer[c] == '$')
{
buffer[c] = 's';
buffer.Insert(c + 1, new char[] { 'c', 'h' }, 0, 2);
}
else if (buffer[c] == '§')
{
buffer[c] = 'c';
buffer.Insert(c + 1, 'h');
}
else if (buffer[c] == '%')
{
buffer[c] = 'e';
buffer.Insert(c + 1, 'i');
}
else if (buffer[c] == '&')
{
buffer[c] = 'i';
buffer.Insert(c + 1, 'e');
}
else if (buffer[c] == '#')
{
buffer[c] = 'i';
buffer.Insert(c + 1, 'g');
}
else if (buffer[c] == '!')
{
buffer[c] = 's';
buffer.Insert(c + 1, 't');
}
}
}
}
}