// blob: 5c2e9ae52a5bd4253164aac3e696fe5d7d92b873 [file] [log] [blame]
using Lucene.Net.Analysis.Util;
using System.IO;
namespace Lucene.Net.Analysis.Id
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Stemmer for Indonesian.
/// <para>
/// Implements the stemming algorithm described in
/// <i>A Study of Stemming Effects on Information Retrieval in
/// Bahasa Indonesia</i>, Fadillah Z Tala.
/// http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf
/// </para>
/// <para>
/// Each call to <c>stem</c> resets and then updates the private per-term
/// state kept in this instance (the syllable count and the removed-prefix flags).
/// </para>
/// </summary>
public class IndonesianStemmer
{
    // Bit flags recording which prefix has already been stripped from the
    // current term. removeSuffix consults them before removing a suffix.
    private const int REMOVED_KE = 1 << 0;
    private const int REMOVED_PENG = 1 << 1;
    private const int REMOVED_DI = 1 << 2;
    private const int REMOVED_MENG = 1 << 3;
    private const int REMOVED_TER = 1 << 4;
    private const int REMOVED_BER = 1 << 5;
    private const int REMOVED_PE = 1 << 6;

    // Particles handled by removeParticle, tried in this order.
    private static readonly string[] Particles = { "kah", "lah", "pun" };

    // Number of vowels counted in the current term, decremented every time
    // an affix rule fires.
    private int numSyllables;

    // Union of the REMOVED_* bits set so far for the current term.
    private int flags;

    // Affix rules are only attempted while more than two syllables remain.
    private bool CanStrip
    {
        get { return numSyllables > 2; }
    }

    /// <summary>
    /// Stem a term (returning its new length).
    /// <para>
    /// Particles and possessive pronouns are always considered; prefixes and
    /// derivational suffixes are only considered when
    /// <paramref name="stemDerivational"/> is <c>true</c>.
    /// </para>
    /// </summary>
    /// <param name="text">Buffer holding the term; it may be modified in place.</param>
    /// <param name="length">Number of characters of <paramref name="text"/> that make up the term.</param>
    /// <param name="stemDerivational"><c>true</c> for full stemming, <c>false</c> for light inflectional stemming only.</param>
    /// <returns>The length of the term after stemming.</returns>
    public virtual int stem(char[] text, int length, bool stemDerivational)
    {
        numSyllables = 0;
        flags = 0;

        // Syllables are approximated by the number of (lowercase) vowels.
        for (int pos = 0; pos < length; pos++)
        {
            numSyllables += isVowel(text[pos]) ? 1 : 0;
        }

        if (CanStrip)
        {
            length = removeParticle(text, length);
        }

        if (CanStrip)
        {
            length = removePossessivePronoun(text, length);
        }

        return stemDerivational ? StemDerivational(text, length) : length;
    }

    /// <summary>
    /// Applies the prefix and suffix rules. The order in which they are tried
    /// depends on whether a first-order prefix rule fired.
    /// </summary>
    private int StemDerivational(char[] text, int length)
    {
        int afterFirstPrefix = CanStrip ? removeFirstOrderPrefix(text, length) : length;

        if (afterFirstPrefix == length)
        {
            // No first-order prefix rule fired: try a second-order prefix,
            // then a suffix, each guarded by the syllable check.
            int current = length;
            if (CanStrip)
            {
                current = removeSecondOrderPrefix(text, current);
            }

            if (CanStrip)
            {
                current = removeSuffix(text, current);
            }

            return current;
        }

        // A first-order prefix rule fired: try a suffix next, and only look
        // for a second-order prefix if that suffix rule fired as well.
        int afterSuffix = CanStrip ? removeSuffix(text, afterFirstPrefix) : afterFirstPrefix;
        if (afterSuffix != afterFirstPrefix && CanStrip)
        {
            return removeSecondOrderPrefix(text, afterSuffix);
        }

        return afterSuffix;
    }

    /// <summary>
    /// Returns <c>true</c> for the lowercase vowels a, e, i, o and u.
    /// </summary>
    private bool isVowel(char ch)
    {
        return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
    }

    /// <summary>
    /// Drops a trailing particle ("kah", "lah" or "pun") if present.
    /// </summary>
    private int removeParticle(char[] text, int length)
    {
        foreach (string particle in Particles)
        {
            if (StemmerUtil.EndsWith(text, length, particle))
            {
                numSyllables--;
                return length - particle.Length;
            }
        }

        return length;
    }

    /// <summary>
    /// Drops a trailing possessive pronoun ("ku", "mu" or "nya") if present.
    /// </summary>
    private int removePossessivePronoun(char[] text, int length)
    {
        int pronounLength;
        if (StemmerUtil.EndsWith(text, length, "ku") || StemmerUtil.EndsWith(text, length, "mu"))
        {
            pronounLength = 2;
        }
        else if (StemmerUtil.EndsWith(text, length, "nya"))
        {
            pronounLength = 3;
        }
        else
        {
            return length;
        }

        numSyllables--;
        return length - pronounLength;
    }

    /// <summary>
    /// Records <paramref name="flag"/>, decrements the syllable count and deletes
    /// the first <paramref name="count"/> characters of the term.
    /// </summary>
    private int StripPrefix(char[] text, int length, int count, int flag)
    {
        flags |= flag;
        numSyllables--;
        return StemmerUtil.DeleteN(text, 0, length, count);
    }

    /// <summary>
    /// Tries the first-order prefix rules (me-, pe- with nasal variants, di-,
    /// ter-, ke-) in a fixed order and applies the first one that matches.
    /// </summary>
    private int removeFirstOrderPrefix(char[] text, int length)
    {
        // "meng..." family.
        if (StemmerUtil.StartsWith(text, length, "meng"))
        {
            return StripPrefix(text, length, 4, REMOVED_MENG);
        }

        if (StemmerUtil.StartsWith(text, length, "meny") && length > 4 && isVowel(text[4]))
        {
            // "meny" + vowel: turn the 'y' into 's' and delete only "men".
            text[3] = 's';
            return StripPrefix(text, length, 3, REMOVED_MENG);
        }

        if (StemmerUtil.StartsWith(text, length, "men") || StemmerUtil.StartsWith(text, length, "mem"))
        {
            return StripPrefix(text, length, 3, REMOVED_MENG);
        }

        if (StemmerUtil.StartsWith(text, length, "me"))
        {
            return StripPrefix(text, length, 2, REMOVED_MENG);
        }

        // "peng..." family.
        if (StemmerUtil.StartsWith(text, length, "peng"))
        {
            return StripPrefix(text, length, 4, REMOVED_PENG);
        }

        if (StemmerUtil.StartsWith(text, length, "peny"))
        {
            if (length > 4 && isVowel(text[4]))
            {
                // "peny" + vowel: turn the 'y' into 's' and delete only "pen".
                text[3] = 's';
                return StripPrefix(text, length, 3, REMOVED_PENG);
            }

            return StripPrefix(text, length, 4, REMOVED_PENG);
        }

        if (StemmerUtil.StartsWith(text, length, "pen"))
        {
            if (length > 3 && isVowel(text[3]))
            {
                // "pen" + vowel: turn the 'n' into 't' and delete only "pe".
                text[2] = 't';
                return StripPrefix(text, length, 2, REMOVED_PENG);
            }

            return StripPrefix(text, length, 3, REMOVED_PENG);
        }

        if (StemmerUtil.StartsWith(text, length, "pem"))
        {
            return StripPrefix(text, length, 3, REMOVED_PENG);
        }

        // Remaining first-order prefixes.
        if (StemmerUtil.StartsWith(text, length, "di"))
        {
            return StripPrefix(text, length, 2, REMOVED_DI);
        }

        if (StemmerUtil.StartsWith(text, length, "ter"))
        {
            return StripPrefix(text, length, 3, REMOVED_TER);
        }

        if (StemmerUtil.StartsWith(text, length, "ke"))
        {
            return StripPrefix(text, length, 2, REMOVED_KE);
        }

        return length;
    }

    /// <summary>
    /// Tries the second-order prefix rules (ber-, be-, per-, pe-) in a fixed
    /// order and applies the first one that matches.
    /// </summary>
    private int removeSecondOrderPrefix(char[] text, int length)
    {
        // "ber...", or exactly the word "belajar": delete three characters.
        if (StemmerUtil.StartsWith(text, length, "ber")
            || (length == 7 && StemmerUtil.StartsWith(text, length, "belajar")))
        {
            return StripPrefix(text, length, 3, REMOVED_BER);
        }

        // "be" + non-vowel + "er": delete only "be".
        if (StemmerUtil.StartsWith(text, length, "be") && length > 4
            && !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r')
        {
            return StripPrefix(text, length, 2, REMOVED_BER);
        }

        // "per...", or exactly the word "pelajar": delete three characters.
        // These two rules set no flag, hence the 0.
        if (StemmerUtil.StartsWith(text, length, "per")
            || (length == 7 && StemmerUtil.StartsWith(text, length, "pelajar")))
        {
            return StripPrefix(text, length, 3, 0);
        }

        if (StemmerUtil.StartsWith(text, length, "pe"))
        {
            return StripPrefix(text, length, 2, REMOVED_PE);
        }

        return length;
    }

    /// <summary>
    /// Returns <c>true</c> when none of the bits in <paramref name="mask"/> is set in the flags.
    /// </summary>
    private bool NoneRemoved(int mask)
    {
        return (flags & mask) == 0;
    }

    /// <summary>
    /// Drops a derivational suffix ("kan", "an" or "i") unless one of the
    /// prefixes removed earlier forbids that particular suffix.
    /// </summary>
    private int removeSuffix(char[] text, int length)
    {
        // Prefix removals that forbid each suffix.
        const int blocksKan = REMOVED_KE | REMOVED_PENG | REMOVED_PE;
        const int blocksAn = REMOVED_DI | REMOVED_MENG | REMOVED_TER;
        const int blocksI = REMOVED_BER | REMOVED_KE | REMOVED_PENG;

        int suffixLength = 0;
        if (StemmerUtil.EndsWith(text, length, "kan") && NoneRemoved(blocksKan))
        {
            suffixLength = 3;
        }
        else if (StemmerUtil.EndsWith(text, length, "an") && NoneRemoved(blocksAn))
        {
            suffixLength = 2;
        }
        else if (StemmerUtil.EndsWith(text, length, "i")
            && !StemmerUtil.EndsWith(text, length, "si")
            && NoneRemoved(blocksI))
        {
            // A final "i" that is part of "si" is never removed.
            suffixLength = 1;
        }

        if (suffixLength == 0)
        {
            return length;
        }

        numSyllables--;
        return length - suffixLength;
    }
}
}