Added PersianStemmer (#571)
Added changes based on apache/lucene#540 and https://issues.apache.org/jira/browse/LUCENE-10312
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianAnalyzer.cs
index 4426a06..c4fcef0 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianAnalyzer.cs
@@ -1,6 +1,7 @@
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Ar;
using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
@@ -80,6 +81,8 @@
}
}
+ private readonly CharArraySet stemExclusionSet;
+
/// <summary>
/// Builds an analyzer with the default stop words:
/// <see cref="DEFAULT_STOPWORD_FILE"/>.
@@ -97,8 +100,25 @@
/// <param name="stopwords">
/// a stopword set </param>
public PersianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
+ : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
+ /// <see cref="PersianStemFilter"/>.
+ /// </summary>
+ /// <param name="matchVersion">
+ /// lucene compatibility version </param>
+ /// <param name="stopwords">
+ /// a stopword set </param>
+ /// <param name="stemExclusionSet">
+ /// a set of terms not to be stemmed </param>
+ public PersianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
: base(matchVersion, stopwords)
{
+ this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
}
/// <summary>
@@ -133,7 +153,12 @@
* the order here is important: the stopword list is normalized with the
* above!
*/
- return new TokenStreamComponents(source, new StopFilter(m_matchVersion, result, m_stopwords));
+ result = new StopFilter(m_matchVersion, result, m_stopwords);
+ if (stemExclusionSet.Count > 0)
+ {
+ result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+ }
+ return new TokenStreamComponents(source, new PersianStemFilter(result));
}
/// <summary>
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilter.cs
new file mode 100644
index 0000000..1a2dc6d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilter.cs
@@ -0,0 +1,61 @@
+// Lucene version compatibility level 9.2
+using Lucene.Net.Analysis.TokenAttributes;
+
+namespace Lucene.Net.Analysis.Fa
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A <see cref="TokenFilter"/> that applies <see cref="PersianStemmer"/> to stem Persian words.
+ /// <para/>
+ /// To prevent terms from being stemmed use an instance of
+ /// <see cref="Miscellaneous.SetKeywordMarkerFilter"/> or a custom <see cref="TokenFilter"/> that sets
+ /// the <see cref="KeywordAttribute"/> before this <see cref="TokenStream"/>.
+ /// </summary>
+ /// <seealso cref="Miscellaneous.SetKeywordMarkerFilter"/>
+ public sealed class PersianStemFilter : TokenFilter
+ {
+ private readonly PersianStemmer stemmer = new PersianStemmer();
+ private readonly ICharTermAttribute termAtt;
+ private readonly IKeywordAttribute keywordAttr;
+
+ public PersianStemFilter(TokenStream input)
+ : base(input)
+ {
+ termAtt = AddAttribute<ICharTermAttribute>();
+ keywordAttr = AddAttribute<IKeywordAttribute>();
+ }
+
+ public override bool IncrementToken()
+ {
+ if (m_input.IncrementToken())
+ {
+ if (!keywordAttr.IsKeyword)
+ {
+ int newlen = stemmer.Stem(termAtt.Buffer, termAtt.Length);
+ termAtt.Length = newlen;
+ }
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilterFactory.cs
new file mode 100644
index 0000000..9a714bc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilterFactory.cs
@@ -0,0 +1,56 @@
+// Lucene version compatibility level 9.2
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Fa
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Factory for <see cref="PersianStemFilter"/>.
+ /// <code>
+ /// &lt;fieldType name="text_fastem" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.PersianNormalizationFilterFactory"/&gt;
+ ///     &lt;filter class="solr.PersianStemFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;
+ /// </code>
+ /// </summary>
+ public class PersianStemFilterFactory : TokenFilterFactory
+ {
+
+ /// <summary>
+ /// Creates a new <see cref="PersianStemFilterFactory"/> </summary>
+ public PersianStemFilterFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ if (args.Count > 0)
+ {
+ throw new ArgumentException(string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args));
+ }
+ }
+
+ public override TokenStream Create(TokenStream input)
+ {
+ return new PersianStemFilter(input);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemmer.cs
new file mode 100644
index 0000000..4e99cc5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemmer.cs
@@ -0,0 +1,118 @@
+// Lucene version compatibility level 9.2
+using Lucene.Net.Analysis.Util;
+using System.Collections.Generic;
+using JCG = J2N.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Fa
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Stemmer for Persian.
+ /// <para/>
+ /// Stemming is done in-place for efficiency, operating on a termbuffer.
+ /// <para/>
+ /// Stemming is defined as:
+ /// <list type="bullet">
+ /// <item><description> Stemming of common suffixes.</description></item>
+ /// <item><description> A suffix is only removed when at least two characters remain afterwards.</description></item>
+ /// </list>
+ /// </summary>
+ public class PersianStemmer
+ {
+ private const char ALEF = '\u0627';
+ private const char HEH = '\u0647';
+ private const char TEH = '\u062A';
+ private const char REH = '\u0631';
+ private const char NOON = '\u0646';
+ private const char YEH = '\u064A';
+ private const char ZWNJ = '\u200c'; // ZERO WIDTH NON-JOINER character
+
+ // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+ private static IList<char[]> Suffixes { get; } = InitializeSuffix();
+
+ private static IList<char[]> InitializeSuffix()
+ {
+ return new JCG.List<char[]>(){
+ ("" + ALEF + TEH).ToCharArray(),
+ ("" + ALEF + NOON).ToCharArray(),
+ ("" + TEH + REH + YEH + NOON).ToCharArray(),
+ ("" + TEH + REH).ToCharArray(),
+ ("" + YEH + YEH).ToCharArray(),
+ ("" + YEH).ToCharArray(),
+ ("" + HEH + ALEF).ToCharArray(),
+ ("" + ZWNJ).ToCharArray()
+ };
+ }
+
+ /// <summary>
+ /// Stem an input buffer of Persian text.
+ /// </summary>
+ /// <param name="s"> input buffer </param>
+ /// <param name="len"> length of input buffer </param>
+ /// <returns> length of input buffer after stemming </returns>
+ public virtual int Stem(char[] s, int len)
+ {
+ len = StemSuffix(s, len);
+
+ return len;
+ }
+
+ /// <summary>
+ /// Stem suffix(es) off a Persian word. </summary>
+ /// <param name="s"> input buffer </param>
+ /// <param name="len"> length of input buffer </param>
+ /// <returns> new length of input buffer after stemming </returns>
+ private int StemSuffix(char[] s, int len)
+ {
+ foreach (var suffix in Suffixes)
+ {
+ if (EndsWithCheckLength(s, len, suffix))
+ {
+ len = StemmerUtil.DeleteN(s, len - suffix.Length, len, suffix.Length);
+ }
+ }
+ return len;
+ }
+
+ /// <summary>
+ /// Returns true if the suffix matches and can be stemmed </summary>
+ /// <param name="s"> input buffer </param>
+ /// <param name="len"> length of input buffer </param>
+ /// <param name="suffix"> suffix to check </param>
+ /// <returns> true if the suffix matches and can be stemmed </returns>
+ internal virtual bool EndsWithCheckLength(char[] s, int len, char[] suffix)
+ {
+ if (len < suffix.Length + 2) // all suffixes require at least 2 characters after stemming
+ {
+ return false;
+ }
+ else
+ {
+ for (int i = 0; i < suffix.Length; i++)
+ {
+ if (s[len - suffix.Length + i] != suffix[i])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Fa/TestPersianStemFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Fa/TestPersianStemFilter.cs
new file mode 100644
index 0000000..f591632
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Fa/TestPersianStemFilter.cs
@@ -0,0 +1,118 @@
+// Lucene version compatibility level 9.2
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Fa
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Test the Persian Stem Filter
+ ///
+ /// </summary>
+
+ public class TestPersianStemFilter : BaseTokenStreamTestCase
+ {
+ internal PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
+
+ [Test]
+ public virtual void TestAnSuffix()
+ {
+ CheckOneTerm(a, "دوستان", "دوست");
+ }
+
+ [Test]
+ public virtual void TestHaSuffix()
+ {
+ CheckOneTerm(a, "كتابها", "كتاب");
+ }
+
+ [Test]
+ public virtual void TestAtSuffix()
+ {
+ CheckOneTerm(a, "جامدات", "جامد");
+ }
+
+ [Test]
+ public virtual void TestYeeSuffix()
+ {
+ CheckOneTerm(a, "عليرضايي", "عليرضا");
+ }
+
+ [Test]
+ public virtual void TestYeSuffix()
+ {
+ CheckOneTerm(a, "شادماني", "شادمان");
+ }
+
+ [Test]
+ public virtual void TestTarSuffix()
+ {
+ CheckOneTerm(a, "باحالتر", "باحال");
+ }
+
+ [Test]
+ public virtual void TestTarinSuffix()
+ {
+ CheckOneTerm(a, "خوبترين", "خوب");
+ }
+
+ [Test]
+ public virtual void TestShouldntStem()
+ {
+ CheckOneTerm(a, "كباب", "كباب");
+ }
+
+ [Test]
+ public virtual void TestNonArabic()
+ {
+ CheckOneTerm(a, "English", "english");
+ }
+
+
+ [Test]
+ public virtual void TestWithKeywordAttribute()
+ {
+ CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
+ set.Add("ساهدهات");
+#pragma warning disable 612, 618
+ StandardTokenizer tokenStream = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader("ساهدهات"));
+#pragma warning restore 612, 618
+
+ PersianStemFilter filter = new PersianStemFilter(new SetKeywordMarkerFilter(tokenStream, set));
+ AssertTokenStreamContents(filter, new string[] { "ساهدهات" });
+ }
+
+ [Test]
+ public virtual void TestEmptyTerm()
+ {
+ Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+ {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new PersianStemFilter(tokenizer));
+ });
+ CheckOneTerm(a, "", "");
+ a.Dispose();
+ }
+
+ }
+}