blob: e45a67298d1a63ae695de4d9248a511a3643d228 [file] [log] [blame]
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
namespace Lucene.Net.Analysis.El
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A stemmer for Greek words, according to: <c>Development of a Stemmer for the
/// Greek Language.</c> Georgios Ntais
/// <para>
/// NOTE: Input is expected to be casefolded for Greek (including folding of final
/// sigma to sigma), and with diacritics removed. This can be achieved with
/// either <see cref="GreekLowerCaseFilter"/> or ICUFoldingFilter.
/// @lucene.experimental
/// </para>
/// </summary>
public class GreekStemmer
{
/// <summary>
/// Stems the word held in the leading <paramref name="len"/> characters of
/// <paramref name="s"/>. The word is passed through a fixed sequence of rules that
/// modify its length; some rules also overwrite characters of <paramref name="s"/>.
/// </summary>
/// <param name="s"> A <see cref="T:char[]"/> array that contains the word to be stemmed. </param>
/// <param name="len"> The number of leading characters of <paramref name="s"/> that form the word. </param>
/// <returns> The new length of the stemmed word. </returns>
public virtual int Stem(char[] s, int len)
{
    // Words of fewer than four characters are returned unchanged.
    if (len < 4)
    {
        return len;
    }

    int stemmed = len;

    // "Short rules": applied in order, each one starting from the length
    // produced by the rule before it.
    stemmed = Rule0(s, stemmed);
    stemmed = Rule1(s, stemmed);
    stemmed = Rule2(s, stemmed);
    stemmed = Rule3(s, stemmed);
    stemmed = Rule4(s, stemmed);
    stemmed = Rule5(s, stemmed);
    stemmed = Rule6(s, stemmed);
    stemmed = Rule7(s, stemmed);
    stemmed = Rule8(s, stemmed);
    stemmed = Rule9(s, stemmed);
    stemmed = Rule10(s, stemmed);
    stemmed = Rule11(s, stemmed);
    stemmed = Rule12(s, stemmed);
    stemmed = Rule13(s, stemmed);
    stemmed = Rule14(s, stemmed);
    stemmed = Rule15(s, stemmed);
    stemmed = Rule16(s, stemmed);
    stemmed = Rule17(s, stemmed);
    stemmed = Rule18(s, stemmed);
    stemmed = Rule19(s, stemmed);
    stemmed = Rule20(s, stemmed);

    // The "long list" is consulted only when none of the short rules
    // changed the length of the word.
    if (stemmed == len)
    {
        stemmed = Rule21(s, stemmed);
    }

    return Rule22(s, stemmed);
}
/// <summary>
/// Rule 0: shortens a fixed list of irregular endings. The groups are tested from the
/// longest ending to the shortest and the first group that matches decides the result.
/// <paramref name="s"/> is not modified.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The length after removing the matched ending's tail, or <paramref name="len"/> when nothing matches. </returns>
private int Rule0(char[] s, int len)
{
    if (len > 9 && (StemmerUtil.EndsWith(s, len, "καθεστωτοσ") ||
        StemmerUtil.EndsWith(s, len, "καθεστωτων")))
    {
        return len - 4;
    }
    if (len > 8 && (StemmerUtil.EndsWith(s, len, "γεγονοτοσ") ||
        StemmerUtil.EndsWith(s, len, "γεγονοτων")))
    {
        return len - 4;
    }
    if (len > 8 && StemmerUtil.EndsWith(s, len, "καθεστωτα"))
    {
        return len - 3;
    }
    if (len > 7 && (StemmerUtil.EndsWith(s, len, "τατογιου") ||
        StemmerUtil.EndsWith(s, len, "τατογιων")))
    {
        return len - 4;
    }
    if (len > 7 && StemmerUtil.EndsWith(s, len, "γεγονοτα"))
    {
        return len - 3;
    }
    if (len > 7 && StemmerUtil.EndsWith(s, len, "καθεστωσ"))
    {
        return len - 2;
    }
    // The length guard is parenthesised together with the whole list of alternatives,
    // exactly like every other group in this rule.
    if (len > 6 && (StemmerUtil.EndsWith(s, len, "σκαγιου") ||
        StemmerUtil.EndsWith(s, len, "σκαγιων") ||
        StemmerUtil.EndsWith(s, len, "ολογιου") ||
        StemmerUtil.EndsWith(s, len, "ολογιων") ||
        StemmerUtil.EndsWith(s, len, "κρεατοσ") ||
        StemmerUtil.EndsWith(s, len, "κρεατων") ||
        StemmerUtil.EndsWith(s, len, "περατοσ") ||
        StemmerUtil.EndsWith(s, len, "περατων") ||
        StemmerUtil.EndsWith(s, len, "τερατοσ") ||
        StemmerUtil.EndsWith(s, len, "τερατων")))
    {
        return len - 4;
    }
    if (len > 6 && StemmerUtil.EndsWith(s, len, "τατογια"))
    {
        return len - 3;
    }
    if (len > 6 && StemmerUtil.EndsWith(s, len, "γεγονοσ"))
    {
        return len - 2;
    }
    if (len > 5 && (StemmerUtil.EndsWith(s, len, "φαγιου") ||
        StemmerUtil.EndsWith(s, len, "φαγιων") ||
        StemmerUtil.EndsWith(s, len, "σογιου") ||
        StemmerUtil.EndsWith(s, len, "σογιων")))
    {
        return len - 4;
    }
    if (len > 5 && (StemmerUtil.EndsWith(s, len, "σκαγια") ||
        StemmerUtil.EndsWith(s, len, "ολογια") ||
        StemmerUtil.EndsWith(s, len, "κρεατα") ||
        StemmerUtil.EndsWith(s, len, "περατα") ||
        StemmerUtil.EndsWith(s, len, "τερατα")))
    {
        return len - 3;
    }
    if (len > 4 && (StemmerUtil.EndsWith(s, len, "φαγια") ||
        StemmerUtil.EndsWith(s, len, "σογια") ||
        StemmerUtil.EndsWith(s, len, "φωτοσ") ||
        StemmerUtil.EndsWith(s, len, "φωτων")))
    {
        return len - 3;
    }
    if (len > 4 && (StemmerUtil.EndsWith(s, len, "κρεασ") ||
        StemmerUtil.EndsWith(s, len, "περασ") ||
        StemmerUtil.EndsWith(s, len, "τερασ")))
    {
        return len - 2;
    }
    if (len > 3 && StemmerUtil.EndsWith(s, len, "φωτα"))
    {
        return len - 2;
    }
    if (len > 2 && StemmerUtil.EndsWith(s, len, "φωσ"))
    {
        return len - 1;
    }
    return len;
}
/// <summary>
/// Rule 1: handles the endings "αδεσ" and "αδων". The whole ending is dropped only
/// when the remaining stem ends with one of the listed exceptions; otherwise "αδ" is kept.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule1(char[] s, int len)
{
    bool hasSuffix = len > 4 &&
        (StemmerUtil.EndsWith(s, len, "αδεσ") || StemmerUtil.EndsWith(s, len, "αδων"));
    if (!hasSuffix)
    {
        return len;
    }

    int stem = len - 4;
    bool dropWholeSuffix =
        StemmerUtil.EndsWith(s, stem, "οκ") ||
        StemmerUtil.EndsWith(s, stem, "μαμ") ||
        StemmerUtil.EndsWith(s, stem, "μαν") ||
        StemmerUtil.EndsWith(s, stem, "μπαμπ") ||
        StemmerUtil.EndsWith(s, stem, "πατερ") ||
        StemmerUtil.EndsWith(s, stem, "γιαγι") ||
        StemmerUtil.EndsWith(s, stem, "νταντ") ||
        StemmerUtil.EndsWith(s, stem, "κυρ") ||
        StemmerUtil.EndsWith(s, stem, "θει") ||
        StemmerUtil.EndsWith(s, stem, "πεθερ");

    // When the stem is not an exception, the leading "αδ" of the ending stays.
    return dropWholeSuffix ? stem : stem + 2;
}
/// <summary>
/// Rule 2: removes the endings "εδεσ" and "εδων". When the remaining stem ends with
/// one of the listed exceptions, "εδ" is kept.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule2(char[] s, int len)
{
    bool hasSuffix = len > 4 &&
        (StemmerUtil.EndsWith(s, len, "εδεσ") || StemmerUtil.EndsWith(s, len, "εδων"));
    if (!hasSuffix)
    {
        return len;
    }

    int stem = len - 4;
    bool keepEd =
        StemmerUtil.EndsWith(s, stem, "οπ") ||
        StemmerUtil.EndsWith(s, stem, "ιπ") ||
        StemmerUtil.EndsWith(s, stem, "εμπ") ||
        StemmerUtil.EndsWith(s, stem, "υπ") ||
        StemmerUtil.EndsWith(s, stem, "γηπ") ||
        StemmerUtil.EndsWith(s, stem, "δαπ") ||
        StemmerUtil.EndsWith(s, stem, "κρασπ") ||
        StemmerUtil.EndsWith(s, stem, "μιλ");

    return keepEd ? stem + 2 : stem;
}
/// <summary>
/// Rule 3: removes the endings "ουδεσ" and "ουδων". When the remaining stem ends with
/// one of the listed exceptions, "ουδ" is kept.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule3(char[] s, int len)
{
    bool hasSuffix = len > 5 &&
        (StemmerUtil.EndsWith(s, len, "ουδεσ") || StemmerUtil.EndsWith(s, len, "ουδων"));
    if (!hasSuffix)
    {
        return len;
    }

    int stem = len - 5;
    bool keepOud =
        StemmerUtil.EndsWith(s, stem, "αρκ") ||
        StemmerUtil.EndsWith(s, stem, "καλιακ") ||
        StemmerUtil.EndsWith(s, stem, "πεταλ") ||
        StemmerUtil.EndsWith(s, stem, "λιχ") ||
        StemmerUtil.EndsWith(s, stem, "πλεξ") ||
        StemmerUtil.EndsWith(s, stem, "σκ") ||
        StemmerUtil.EndsWith(s, stem, "σ") ||
        StemmerUtil.EndsWith(s, stem, "φλ") ||
        StemmerUtil.EndsWith(s, stem, "φρ") ||
        StemmerUtil.EndsWith(s, stem, "βελ") ||
        StemmerUtil.EndsWith(s, stem, "λουλ") ||
        StemmerUtil.EndsWith(s, stem, "χν") ||
        StemmerUtil.EndsWith(s, stem, "σπ") ||
        StemmerUtil.EndsWith(s, stem, "τραγ") ||
        StemmerUtil.EndsWith(s, stem, "φε");

    return keepOud ? stem + 3 : stem;
}
// Whole stems after which Rule4 keeps the "ε" of the removed ending.
#pragma warning disable 612, 618
private static readonly CharArraySet exc4 = new CharArraySet(LuceneVersion.LUCENE_CURRENT,
    new string[] { "θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ" }, false);
#pragma warning restore 612, 618

/// <summary>
/// Rule 4: removes the endings "εωσ" and "εων". When the remaining stem is one of the
/// entries of <c>exc4</c>, the "ε" is kept.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule4(char[] s, int len)
{
    if (len <= 3 ||
        !(StemmerUtil.EndsWith(s, len, "εωσ") || StemmerUtil.EndsWith(s, len, "εων")))
    {
        return len;
    }

    int stem = len - 3;
    return exc4.Contains(s, 0, stem) ? stem + 1 : stem;
}
/// <summary>
/// Rule 5: removes the endings "ια", "ιου" and "ιων". When the remaining stem ends
/// with a vowel, the "ι" is kept.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule5(char[] s, int len)
{
    int suffixLength;
    if (len > 2 && StemmerUtil.EndsWith(s, len, "ια"))
    {
        suffixLength = 2;
    }
    else if (len > 3 &&
        (StemmerUtil.EndsWith(s, len, "ιου") || StemmerUtil.EndsWith(s, len, "ιων")))
    {
        suffixLength = 3;
    }
    else
    {
        return len;
    }

    int stem = len - suffixLength;
    return EndsWithVowel(s, stem) ? stem + 1 : stem;
}
// Whole stems after which Rule6 keeps the "ικ" of the removed ending.
private static readonly CharArraySet exc6 =
#pragma warning disable 612, 618
    new CharArraySet(LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
        new string[] { "αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
            "αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
            "μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
            "πετσ", "πιτσ", "πικαντ", "πλιατσ", "ποστελν", "πρωτοδ", "σερτ",
            "συναδ", "τσαμ", "υποδ", "φιλον", "φυλοδ", "χασ" }, false);

/// <summary>
/// Rule 6: removes the endings "ικα", "ικο", "ικου" and "ικων". When the remaining stem
/// ends with a vowel or is one of the entries of <c>exc6</c>, "ικ" is kept.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule6(char[] s, int len)
{
    int suffixLength;
    if (len > 3 &&
        (StemmerUtil.EndsWith(s, len, "ικα") || StemmerUtil.EndsWith(s, len, "ικο")))
    {
        suffixLength = 3;
    }
    else if (len > 4 &&
        (StemmerUtil.EndsWith(s, len, "ικου") || StemmerUtil.EndsWith(s, len, "ικων")))
    {
        suffixLength = 4;
    }
    else
    {
        return len;
    }

    int stem = len - suffixLength;
    bool keepIk = EndsWithVowel(s, stem) || exc6.Contains(s, 0, stem);
    return keepIk ? stem + 2 : stem;
}
// Whole stems after which Rule7 keeps the "αμ" of a removed "αμε".
private static readonly CharArraySet exc7 =
#pragma warning disable 612, 618
    new CharArraySet(LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
        new string[] { "αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
            "πεθ", "πικρ", "ποτ", "σιχ", "χ" }, false);

/// <summary>
/// Rule 7: handles endings in "-αμε". The five-letter word "αγαμε" only loses its last
/// letter. Otherwise one of "ηθηκαμε", "ουσαμε", "αγαμε", "ησαμε", "ηκαμε" is removed
/// if present, and then a trailing "αμε" is removed, keeping "αμ" when the remaining
/// stem is one of the entries of <c>exc7</c>.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule7(char[] s, int len)
{
    if (len == 5 && StemmerUtil.EndsWith(s, len, "αγαμε"))
    {
        return len - 1;
    }

    int current = len;
    if (current > 7 && StemmerUtil.EndsWith(s, current, "ηθηκαμε"))
    {
        current -= 7;
    }
    else if (current > 6 && StemmerUtil.EndsWith(s, current, "ουσαμε"))
    {
        current -= 6;
    }
    else if (current > 5 &&
        (StemmerUtil.EndsWith(s, current, "αγαμε") ||
         StemmerUtil.EndsWith(s, current, "ησαμε") ||
         StemmerUtil.EndsWith(s, current, "ηκαμε")))
    {
        current -= 5;
    }

    // The plain "αμε" check runs on whatever is left after the step above.
    if (current <= 3 || !StemmerUtil.EndsWith(s, current, "αμε"))
    {
        return current;
    }

    int stem = current - 3;
    return exc7.Contains(s, 0, stem) ? stem + 2 : stem;
}
// Whole stems after which Rule8 appends "αγαν" in place of the removed ending.
private static readonly CharArraySet exc8a = new CharArraySet(
#pragma warning disable 612, 618
LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
new string[] { "τρ", "τσ" }, false);
// Whole stems after which Rule8 keeps the "αν" of a removed "ανε".
private static readonly CharArraySet exc8b = new CharArraySet(
#pragma warning disable 612, 618
LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
new string[] { "βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
"καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
"π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
"τσαρλατ", "ορφ", "τσιγγ", "τσοπ", "φωτοστεφ", "χ", "ψυχοπλ", "αγ",
"ορφ", "γαλ", "γερ", "δεκ", "διπλ", "αμερικαν", "ουρ", "πιθ",
"πουριτ", "σ", "ζωντ", "ικ", "καστ", "κοπ", "λιχ", "λουθηρ", "μαιντ",
"μελ", "σιγ", "σπ", "στεγ", "τραγ", "τσαγ", "φ", "ερ", "αδαπ",
"αθιγγ", "αμηχ", "ανικ", "ανοργ", "απηγ", "απιθ", "ατσιγγ", "βασ",
"βασκ", "βαθυγαλ", "βιομηχ", "βραχυκ", "διατ", "διαφ", "ενοργ",
"θυσ", "καπνοβιομηχ", "καταγαλ", "κλιβ", "κοιλαρφ", "λιβ",
"μεγλοβιομηχ", "μικροβιομηχ", "νταβ", "ξηροκλιβ", "ολιγοδαμ",
"ολογαλ", "πενταρφ", "περηφ", "περιτρ", "πλατ", "πολυδαπ", "πολυμηχ",
"στεφ", "ταβ", "τετ", "υπερηφ", "υποκοπ", "χαμηλοδαπ", "ψηλοταβ" }, false);

/// <summary>
/// Rule 8: handles endings in "-ανε". One of the long endings ("ιουντανε", "ιοντανε",
/// "ουντανε", "ηθηκανε", "ιοτανε", "οντανε", "ουσανε", "αγανε", "ησανε", "οτανε",
/// "ηκανε") is removed if present; when the remaining stem is an entry of <c>exc8a</c>,
/// "αγαν" is written after it. Afterwards a trailing "ανε" is removed, keeping "αν" when
/// the stem ends with a vowel other than 'υ' or is an entry of <c>exc8b</c>.
/// May overwrite characters of <paramref name="s"/>.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule8(char[] s, int len)
{
    bool removed = false;
    if (len > 8 && StemmerUtil.EndsWith(s, len, "ιουντανε"))
    {
        len -= 8;
        removed = true;
    }
    // The length guard must cover every alternative of its group. Without the
    // parentheses '&&' binds tighter than '||', so a word consisting solely of one of
    // the later endings would match and be cut down to length 0.
    else if (len > 7 && (StemmerUtil.EndsWith(s, len, "ιοντανε") ||
        StemmerUtil.EndsWith(s, len, "ουντανε") ||
        StemmerUtil.EndsWith(s, len, "ηθηκανε")))
    {
        len -= 7;
        removed = true;
    }
    else if (len > 6 && (StemmerUtil.EndsWith(s, len, "ιοτανε") ||
        StemmerUtil.EndsWith(s, len, "οντανε") ||
        StemmerUtil.EndsWith(s, len, "ουσανε")))
    {
        len -= 6;
        removed = true;
    }
    else if (len > 5 && (StemmerUtil.EndsWith(s, len, "αγανε") ||
        StemmerUtil.EndsWith(s, len, "ησανε") ||
        StemmerUtil.EndsWith(s, len, "οτανε") ||
        StemmerUtil.EndsWith(s, len, "ηκανε")))
    {
        len -= 5;
        removed = true;
    }
    if (removed && exc8a.Contains(s, 0, len))
    {
        // add -αγαν (at least 5 chars were removed, so the 4 writes stay inside the original word)
        len += 4;
        s[len - 4] = 'α';
        s[len - 3] = 'γ';
        s[len - 2] = 'α';
        s[len - 1] = 'ν';
    }
    if (len > 3 && StemmerUtil.EndsWith(s, len, "ανε"))
    {
        len -= 3;
        if (EndsWithVowelNoY(s, len) || exc8b.Contains(s, 0, len))
        {
            len += 2; // add back -αν
        }
    }
    return len;
}
// Whole stems after which Rule9 keeps the "ετ" of a removed "ετε".
private static readonly CharArraySet exc9 = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
        "βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
        "σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ" }, false);

/// <summary>
/// Rule 9: removes a trailing "ησετε" if present, then removes a trailing "ετε".
/// "ετ" is kept when the remaining stem is an entry of <c>exc9</c>, ends with a vowel
/// other than 'υ', or ends with one of the listed exceptions.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule9(char[] s, int len)
{
    int current = len;
    if (current > 5 && StemmerUtil.EndsWith(s, current, "ησετε"))
    {
        current -= 5;
    }

    if (current <= 3 || !StemmerUtil.EndsWith(s, current, "ετε"))
    {
        return current;
    }

    int stem = current - 3;
    bool keepEt =
        exc9.Contains(s, 0, stem) ||
        EndsWithVowelNoY(s, stem) ||
        StemmerUtil.EndsWith(s, stem, "οδ") ||
        StemmerUtil.EndsWith(s, stem, "αιρ") ||
        StemmerUtil.EndsWith(s, stem, "φορ") ||
        StemmerUtil.EndsWith(s, stem, "ταθ") ||
        StemmerUtil.EndsWith(s, stem, "διαθ") ||
        StemmerUtil.EndsWith(s, stem, "σχ") ||
        StemmerUtil.EndsWith(s, stem, "ενδ") ||
        StemmerUtil.EndsWith(s, stem, "ευρ") ||
        StemmerUtil.EndsWith(s, stem, "τιθ") ||
        StemmerUtil.EndsWith(s, stem, "υπερθ") ||
        StemmerUtil.EndsWith(s, stem, "ραθ") ||
        StemmerUtil.EndsWith(s, stem, "ενθ") ||
        StemmerUtil.EndsWith(s, stem, "ροθ") ||
        StemmerUtil.EndsWith(s, stem, "σθ") ||
        StemmerUtil.EndsWith(s, stem, "πυρ") ||
        StemmerUtil.EndsWith(s, stem, "αιν") ||
        StemmerUtil.EndsWith(s, stem, "συνδ") ||
        StemmerUtil.EndsWith(s, stem, "συν") ||
        StemmerUtil.EndsWith(s, stem, "συνθ") ||
        StemmerUtil.EndsWith(s, stem, "χωρ") ||
        StemmerUtil.EndsWith(s, stem, "πον") ||
        StemmerUtil.EndsWith(s, stem, "βρ") ||
        StemmerUtil.EndsWith(s, stem, "καθ") ||
        StemmerUtil.EndsWith(s, stem, "ευθ") ||
        StemmerUtil.EndsWith(s, stem, "εκθ") ||
        StemmerUtil.EndsWith(s, stem, "νετ") ||
        StemmerUtil.EndsWith(s, stem, "ρον") ||
        StemmerUtil.EndsWith(s, stem, "αρκ") ||
        StemmerUtil.EndsWith(s, stem, "βαρ") ||
        StemmerUtil.EndsWith(s, stem, "βολ") ||
        StemmerUtil.EndsWith(s, stem, "ωφελ");

    return keepEt ? stem + 2 : stem;
}
/// <summary>
/// Rule 10: removes the endings "οντασ" and "ωντασ". For the three-letter stem "αρχ"
/// the word is extended again by three characters with 'ο' written as the first of them;
/// for a stem ending in "κρε" the same is done with 'ω'.
/// May overwrite one character of <paramref name="s"/>.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule10(char[] s, int len)
{
    bool hasSuffix = len > 5 &&
        (StemmerUtil.EndsWith(s, len, "οντασ") || StemmerUtil.EndsWith(s, len, "ωντασ"));
    if (!hasSuffix)
    {
        return len;
    }

    int current = len - 5;
    if (current == 3 && StemmerUtil.EndsWith(s, current, "αρχ"))
    {
        // restore "*ντ": the two following characters are still in the buffer
        s[current] = 'ο';
        current += 3;
    }
    if (StemmerUtil.EndsWith(s, current, "κρε"))
    {
        // restore "*ντ": the two following characters are still in the buffer
        s[current] = 'ω';
        current += 3;
    }
    return current;
}
/// <summary>
/// Rule 11: removes the ending "ομαστε"; when the remaining stem is exactly "ον",
/// "ομαστ" is kept. A second branch handles "ιομαστε" in the same way, rewriting the
/// characters after the stem to "ομαστ".
/// </summary>
/// <remarks>
/// NOTE(review): every word ending in "ιομαστε" also ends in "ομαστε", so the first
/// branch always wins and the "ιομαστε" branch appears to be unreachable. The order is
/// kept as is to preserve the current output — confirm the intended order before changing it.
/// </remarks>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule11(char[] s, int len)
{
    if (len > 6 && StemmerUtil.EndsWith(s, len, "ομαστε"))
    {
        int stem = len - 6;
        bool isOn = stem == 2 && StemmerUtil.EndsWith(s, stem, "ον");
        // "ομαστ" is still in the buffer, so only the length needs restoring.
        return isOn ? stem + 5 : stem;
    }

    if (len > 7 && StemmerUtil.EndsWith(s, len, "ιομαστε"))
    {
        int stem = len - 7;
        if (stem == 2 && StemmerUtil.EndsWith(s, stem, "ον"))
        {
            s[stem] = 'ο';
            s[stem + 1] = 'μ';
            s[stem + 2] = 'α';
            s[stem + 3] = 'σ';
            s[stem + 4] = 'τ';
            return stem + 5;
        }
        return stem;
    }

    return len;
}
#pragma warning disable 612, 618
// Whole stems after which Rule12 keeps the "ιεστ" of a removed "ιεστε".
private static readonly CharArraySet exc12a = new CharArraySet(LuceneVersion.LUCENE_CURRENT,
    new string[] { "π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ" }, false);
// Whole stems after which Rule12 keeps the "εστ" of a removed "εστε".
private static readonly CharArraySet exc12b = new CharArraySet(LuceneVersion.LUCENE_CURRENT,
    new string[] { "αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ" }, false);
#pragma warning restore 612, 618

/// <summary>
/// Rule 12: removes a trailing "ιεστε" (keeping "ιεστ" when the stem is an entry of
/// <c>exc12a</c>), and then a trailing "εστε" (keeping "εστ" when the stem is an entry
/// of <c>exc12b</c>). The second check runs on the result of the first.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule12(char[] s, int len)
{
    int current = len;

    if (current > 5 && StemmerUtil.EndsWith(s, current, "ιεστε"))
    {
        int stem = current - 5;
        current = exc12a.Contains(s, 0, stem) ? stem + 4 : stem;
    }

    if (current > 4 && StemmerUtil.EndsWith(s, current, "εστε"))
    {
        int stem = current - 4;
        current = exc12b.Contains(s, 0, stem) ? stem + 3 : stem;
    }

    return current;
}
// Whole stems after which Rule13 keeps the "ηκ" of the removed ending.
private static readonly CharArraySet exc13 = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "διαθ", "θ", "παρακαταθ", "προσθ", "συνθ" }, false);

/// <summary>
/// Rule 13: first removes "ηθηκεσ", "ηθηκα" or "ηθηκε" if present; then removes "ηκεσ",
/// "ηκα" or "ηκε" from what is left. After the second removal "ηκ" is kept when the stem
/// is an entry of <c>exc13</c> or ends with one of the listed exceptions.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule13(char[] s, int len)
{
    int current = len;
    if (current > 6 && StemmerUtil.EndsWith(s, current, "ηθηκεσ"))
    {
        current -= 6;
    }
    else if (current > 5 &&
        (StemmerUtil.EndsWith(s, current, "ηθηκα") || StemmerUtil.EndsWith(s, current, "ηθηκε")))
    {
        current -= 5;
    }

    int suffixLength;
    if (current > 4 && StemmerUtil.EndsWith(s, current, "ηκεσ"))
    {
        suffixLength = 4;
    }
    else if (current > 3 &&
        (StemmerUtil.EndsWith(s, current, "ηκα") || StemmerUtil.EndsWith(s, current, "ηκε")))
    {
        suffixLength = 3;
    }
    else
    {
        return current;
    }

    int stem = current - suffixLength;
    bool keepIk =
        exc13.Contains(s, 0, stem) ||
        StemmerUtil.EndsWith(s, stem, "σκωλ") ||
        StemmerUtil.EndsWith(s, stem, "σκουλ") ||
        StemmerUtil.EndsWith(s, stem, "ναρθ") ||
        StemmerUtil.EndsWith(s, stem, "σφ") ||
        StemmerUtil.EndsWith(s, stem, "οθ") ||
        StemmerUtil.EndsWith(s, stem, "πιθ");

    return keepIk ? stem + 2 : stem;
}
// Whole stems after which Rule14 keeps the "ουσ" of the removed ending.
private static readonly CharArraySet exc14 = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
        "λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
        "ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε", "τσα" }, false);

/// <summary>
/// Rule 14: removes the endings "ουσεσ", "ουσα" and "ουσε". "ουσ" is kept when the
/// remaining stem is an entry of <c>exc14</c>, ends with a vowel, or ends with one of
/// the listed exceptions.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule14(char[] s, int len)
{
    int suffixLength;
    if (len > 5 && StemmerUtil.EndsWith(s, len, "ουσεσ"))
    {
        suffixLength = 5;
    }
    else if (len > 4 &&
        (StemmerUtil.EndsWith(s, len, "ουσα") || StemmerUtil.EndsWith(s, len, "ουσε")))
    {
        suffixLength = 4;
    }
    else
    {
        return len;
    }

    int stem = len - suffixLength;
    bool keepOus =
        exc14.Contains(s, 0, stem) ||
        EndsWithVowel(s, stem) ||
        StemmerUtil.EndsWith(s, stem, "ποδαρ") ||
        StemmerUtil.EndsWith(s, stem, "βλεπ") ||
        StemmerUtil.EndsWith(s, stem, "πανταχ") ||
        StemmerUtil.EndsWith(s, stem, "φρυδ") ||
        StemmerUtil.EndsWith(s, stem, "μαντιλ") ||
        StemmerUtil.EndsWith(s, stem, "μαλλ") ||
        StemmerUtil.EndsWith(s, stem, "κυματ") ||
        StemmerUtil.EndsWith(s, stem, "λαχ") ||
        StemmerUtil.EndsWith(s, stem, "ληγ") ||
        StemmerUtil.EndsWith(s, stem, "φαγ") ||
        StemmerUtil.EndsWith(s, stem, "ομ") ||
        StemmerUtil.EndsWith(s, stem, "πρωτ");

    return keepOus ? stem + 3 : stem;
}
// Whole stems that make Rule15 keep the "αγ" of the removed ending.
private static readonly CharArraySet exc15a = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
        "αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
        "ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
        "συντ", "τ", "υποτ", "χαρ", "αειπ", "αιμοστ", "ανυπ", "αποτ",
        "αρτιπ", "διατ", "εν", "επιτ", "κροκαλοπ", "σιδηροπ", "λ", "ναυ",
        "ουλαμ", "ουρ", "π", "τρ", "μ" }, false);
// Whole stems that veto keeping "αγ" in Rule15.
private static readonly CharArraySet exc15b = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "ψοφ", "ναυλοχ" }, false);

/// <summary>
/// Rule 15: removes the endings "αγεσ", "αγα" and "αγε". "αγ" is kept when the stem is
/// an entry of <c>exc15a</c> or ends with one of the listed exceptions, unless the stem
/// is an entry of <c>exc15b</c> or ends with "κολλ".
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule15(char[] s, int len)
{
    int suffixLength;
    if (len > 4 && StemmerUtil.EndsWith(s, len, "αγεσ"))
    {
        suffixLength = 4;
    }
    else if (len > 3 &&
        (StemmerUtil.EndsWith(s, len, "αγα") || StemmerUtil.EndsWith(s, len, "αγε")))
    {
        suffixLength = 3;
    }
    else
    {
        return len;
    }

    int stem = len - suffixLength;

    bool wantsAg =
        exc15a.Contains(s, 0, stem) ||
        StemmerUtil.EndsWith(s, stem, "οφ") ||
        StemmerUtil.EndsWith(s, stem, "πελ") ||
        StemmerUtil.EndsWith(s, stem, "χορτ") ||
        StemmerUtil.EndsWith(s, stem, "λλ") ||
        StemmerUtil.EndsWith(s, stem, "σφ") ||
        StemmerUtil.EndsWith(s, stem, "ρπ") ||
        StemmerUtil.EndsWith(s, stem, "φρ") ||
        StemmerUtil.EndsWith(s, stem, "πρ") ||
        StemmerUtil.EndsWith(s, stem, "λοχ") ||
        StemmerUtil.EndsWith(s, stem, "σμην");
    if (!wantsAg)
    {
        return stem;
    }

    bool vetoed = exc15b.Contains(s, 0, stem) || StemmerUtil.EndsWith(s, stem, "κολλ");
    return vetoed ? stem : stem + 2;
}
// Whole stems after which Rule16 keeps the "ησ" of the removed ending.
private static readonly CharArraySet exc16 = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν" }, false);

/// <summary>
/// Rule 16: removes the endings "ησου", "ησε" and "ησα". "ησ" is kept when the
/// remaining stem is an entry of <c>exc16</c>.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule16(char[] s, int len)
{
    int suffixLength;
    if (len > 4 && StemmerUtil.EndsWith(s, len, "ησου"))
    {
        suffixLength = 4;
    }
    else if (len > 3 &&
        (StemmerUtil.EndsWith(s, len, "ησε") || StemmerUtil.EndsWith(s, len, "ησα")))
    {
        suffixLength = 3;
    }
    else
    {
        return len;
    }

    int stem = len - suffixLength;
    return exc16.Contains(s, 0, stem) ? stem + 2 : stem;
}
// Whole stems after which Rule17 keeps the "ηστ" of the removed ending.
private static readonly CharArraySet exc17 = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ" }, false);

/// <summary>
/// Rule 17: removes the ending "ηστε". "ηστ" is kept when the remaining stem is an
/// entry of <c>exc17</c>.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule17(char[] s, int len)
{
    if (len <= 4 || !StemmerUtil.EndsWith(s, len, "ηστε"))
    {
        return len;
    }

    int stem = len - 4;
    return exc17.Contains(s, 0, stem) ? stem + 3 : stem;
}
// Whole stems after which Rule18 appends "ουν".
private static readonly CharArraySet exc18 = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων" }, false);

/// <summary>
/// Rule 18: removes the endings "ησουνε", "ηθουνε" and "ουνε". When the remaining stem
/// is an entry of <c>exc18</c>, "ουν" is written after it.
/// May overwrite characters of <paramref name="s"/>.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule18(char[] s, int len)
{
    int suffixLength;
    if (len > 6 &&
        (StemmerUtil.EndsWith(s, len, "ησουνε") || StemmerUtil.EndsWith(s, len, "ηθουνε")))
    {
        suffixLength = 6;
    }
    else if (len > 4 && StemmerUtil.EndsWith(s, len, "ουνε"))
    {
        suffixLength = 4;
    }
    else
    {
        return len;
    }

    int stem = len - suffixLength;
    if (!exc18.Contains(s, 0, stem))
    {
        return stem;
    }

    s[stem] = 'ο';
    s[stem + 1] = 'υ';
    s[stem + 2] = 'ν';
    return stem + 3;
}
// Whole stems after which Rule19 appends "ουμ".
private static readonly CharArraySet exc19 = new CharArraySet(
#pragma warning disable 612, 618
    LuceneVersion.LUCENE_CURRENT,
#pragma warning restore 612, 618
    new string[] { "παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ" }, false);

/// <summary>
/// Rule 19: removes the endings "ησουμε", "ηθουμε" and "ουμε". When the remaining stem
/// is an entry of <c>exc19</c>, "ουμ" is written after it.
/// May overwrite characters of <paramref name="s"/>.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule19(char[] s, int len)
{
    int suffixLength;
    if (len > 6 &&
        (StemmerUtil.EndsWith(s, len, "ησουμε") || StemmerUtil.EndsWith(s, len, "ηθουμε")))
    {
        suffixLength = 6;
    }
    else if (len > 4 && StemmerUtil.EndsWith(s, len, "ουμε"))
    {
        suffixLength = 4;
    }
    else
    {
        return len;
    }

    int stem = len - suffixLength;
    if (!exc19.Contains(s, 0, stem))
    {
        return stem;
    }

    s[stem] = 'ο';
    s[stem + 1] = 'υ';
    s[stem + 2] = 'μ';
    return stem + 3;
}
/// <summary>
/// Rule 20: shortens the endings "ματων" and "ματοσ" by three characters and the
/// ending "ματα" by two, so that "μα" remains in each case.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule20(char[] s, int len)
{
    bool longForm = len > 5 &&
        (StemmerUtil.EndsWith(s, len, "ματων") || StemmerUtil.EndsWith(s, len, "ματοσ"));
    if (longForm)
    {
        return len - 3;
    }

    bool shortForm = len > 4 && StemmerUtil.EndsWith(s, len, "ματα");
    return shortForm ? len - 2 : len;
}
// Endings removed by Rule21, ordered from the longest to the shortest.
// The first entry that matches decides the result.
private static readonly string[] rule21Suffixes =
{
    // 9 characters
    "ιοντουσαν",
    // 8 characters
    "ιομασταν", "ιοσασταν", "ιουμαστε", "οντουσαν",
    // 7 characters
    "ιεμαστε", "ιεσαστε", "ιομουνα", "ιοσαστε", "ιοσουνα", "ιουνται",
    "ιουνταν", "ηθηκατε", "ομασταν", "οσασταν", "ουμαστε",
    // 6 characters
    "ιομουν", "ιονταν", "ιοσουν", "ηθειτε", "ηθηκαν", "ομουνα",
    "οσαστε", "οσουνα", "ουνται", "ουνταν", "ουσατε",
    // 5 characters
    "αγατε", "ιεμαι", "ιεται", "ιεσαι", "ιοταν", "ιουμα", "ηθεισ", "ηθουν",
    "ηκατε", "ησατε", "ησουν", "ομουν", "ονται", "ονταν", "οσουν", "ουμαι", "ουσαν",
    // 4 characters
    "αγαν", "αμαι", "ασαι", "αται", "ειτε", "εσαι", "εται", "ηδεσ",
    "ηδων", "ηθει", "ηκαν", "ησαν", "ησει", "ησεσ", "ομαι", "οταν",
    // 3 characters
    "αει", "εισ", "ηθω", "ησω", "ουν", "ουσ",
    // 2 characters
    "αν", "ασ", "αω", "ει", "εσ", "ησ", "οι", "οσ", "ου", "υσ", "ων"
};

/// <summary>
/// Rule 21, the "long list": removes the longest matching ending from a fixed list,
/// provided the word is longer than that ending. When no ending matches, a single
/// trailing vowel is removed from words longer than one character.
/// <paramref name="s"/> is not modified.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule21(char[] s, int len)
{
    foreach (string suffix in rule21Suffixes)
    {
        if (len > suffix.Length && StemmerUtil.EndsWith(s, len, suffix))
        {
            return len - suffix.Length;
        }
    }

    if (len > 1 && EndsWithVowel(s, len))
    {
        return len - 1;
    }

    return len;
}
/// <summary>
/// Rule 22: removes the endings "εστερ" and "εστατ" (five characters), or "οτερ", "οτατ",
/// "υτερ", "υτατ", "ωτερ" and "ωτατ" (four characters). Unlike the other rules, no
/// minimum word length is required beyond the ending itself.
/// </summary>
/// <param name="s"> The buffer holding the word. </param>
/// <param name="len"> The current length of the word in <paramref name="s"/>. </param>
/// <returns> The new length of the word. </returns>
private int Rule22(char[] s, int len)
{
    bool fiveLetterEnding =
        StemmerUtil.EndsWith(s, len, "εστερ") ||
        StemmerUtil.EndsWith(s, len, "εστατ");
    if (fiveLetterEnding)
    {
        return len - 5;
    }

    bool fourLetterEnding =
        StemmerUtil.EndsWith(s, len, "οτερ") ||
        StemmerUtil.EndsWith(s, len, "οτατ") ||
        StemmerUtil.EndsWith(s, len, "υτερ") ||
        StemmerUtil.EndsWith(s, len, "υτατ") ||
        StemmerUtil.EndsWith(s, len, "ωτερ") ||
        StemmerUtil.EndsWith(s, len, "ωτατ");

    return fourLetterEnding ? len - 4 : len;
}
/// <summary>
/// Tests whether the word formed by the first <paramref name="len"/> characters of
/// <paramref name="s"/> ends with <paramref name="suffix"/>.
/// </summary>
/// <param name="s"> A char[] array that represents a word. </param>
/// <param name="len"> The number of leading characters of <paramref name="s"/> that form the word. </param>
/// <param name="suffix"> The <see cref="string"/> to look for at the end of the word. </param>
/// <returns> True if the word ends with the suffix given, false otherwise
/// (including when the suffix is longer than the word). </returns>
private bool EndsWith(char[] s, int len, string suffix)
{
    if (suffix.Length > len)
    {
        return false;
    }

    // Index in s of the character that lines up with suffix[0].
    int offset = len - suffix.Length;

    // Compare from the last character of the suffix back to the first.
    int k = suffix.Length;
    while (k-- > 0)
    {
        if (s[offset + k] != suffix[k])
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Tests whether the word formed by the first <paramref name="len"/> characters of
/// <paramref name="s"/> ends with one of the Greek vowels α, ε, η, ι, ο, υ, ω.
/// </summary>
/// <param name="s"> A <see cref="T:char[]"/> array that represents a word. </param>
/// <param name="len"> The number of leading characters of <paramref name="s"/> that form the word. </param>
/// <returns> True if the word ends with a vowel, false otherwise (including for an empty word). </returns>
private bool EndsWithVowel(char[] s, int len)
{
    const string Vowels = "αεηιουω";

    if (len == 0)
    {
        return false;
    }

    char last = s[len - 1];
    return Vowels.IndexOf(last) >= 0;
}
/// <summary>
/// Tests whether the word formed by the first <paramref name="len"/> characters of
/// <paramref name="s"/> ends with one of the Greek vowels α, ε, η, ι, ο, ω.
/// Unlike <c>EndsWithVowel</c>, the letter υ is not counted.
/// </summary>
/// <param name="s"> A <see cref="T:char[]"/> array that represents a word. </param>
/// <param name="len"> The number of leading characters of <paramref name="s"/> that form the word. </param>
/// <returns> True if the word ends with a vowel other than υ, false otherwise
/// (including for an empty word). </returns>
private bool EndsWithVowelNoY(char[] s, int len)
{
    const string VowelsWithoutUpsilon = "αεηιοω";

    if (len == 0)
    {
        return false;
    }

    char last = s[len - 1];
    return VowelsWithoutUpsilon.IndexOf(last) >= 0;
}
}
}