Added PersianStemmer (#571) Added changes based on apache/lucene#540 and https://issues.apache.org/jira/browse/LUCENE-10312

commit: c7ab459bdfc076fc12f7cba296f9a6ebff1bc2c9 [log] [tgz]
author: Ramin ALirezaee <raminmjj@users.noreply.github.com> Sun May 22 17:48:26 2022 +0430
committer: GitHub <noreply@github.com> Sun May 22 20:18:26 2022 +0700
tree: a62200b11c688dbec71a9f5891d05cd02f271b61
parent: a5c0b995cf0de3f3d7d19265728f55ef366e5a61 [diff]
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianAnalyzer.cs
index 4426a06..c4fcef0 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianAnalyzer.cs

@@ -1,6 +1,7 @@
 // Lucene version compatibility level 4.8.1
 using Lucene.Net.Analysis.Ar;
 using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
@@ -80,6 +81,8 @@
             }
         }
 
+        private readonly CharArraySet stemExclusionSet;
+
         /// <summary>
         /// Builds an analyzer with the default stop words:
         /// <see cref="DEFAULT_STOPWORD_FILE"/>.
@@ -97,8 +100,25 @@
         /// <param name="stopwords">
         ///          a stopword set </param>
         public PersianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
+              : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+        /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
+        /// <see cref="PersianStemFilter"/>.
+        /// </summary>
+        /// <param name="matchVersion">
+        ///          lucene compatibility version </param>
+        /// <param name="stopwords">
+        ///          a stopword set </param>
+        /// <param name="stemExclusionSet">
+        ///          a set of terms not to be stemmed </param>
+        public PersianAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet)
               : base(matchVersion, stopwords)
         {
+            this.stemExclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionSet));
         }
 
         /// <summary>
@@ -133,7 +153,12 @@
              * the order here is important: the stopword list is normalized with the
              * above!
              */
-            return new TokenStreamComponents(source, new StopFilter(m_matchVersion, result, m_stopwords));
+            result = new StopFilter(m_matchVersion, result, m_stopwords);
+            if (stemExclusionSet.Count > 0)
+            {
+                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+            }
+            return new TokenStreamComponents(source, new PersianStemFilter(result));
         }
 
         /// <summary>

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilter.cs
new file mode 100644
index 0000000..1a2dc6d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilter.cs

@@ -0,0 +1,61 @@
+// Lucene version compatibility level 9.2
+using Lucene.Net.Analysis.TokenAttributes;
+
+namespace Lucene.Net.Analysis.Fa
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// A <see cref="TokenFilter"/> that applies <see cref="PersianStemmer"/> to stem Persian words.
+    /// <para/>
+    /// To prevent terms from being stemmed use an instance of
+    /// <see cref="Miscellaneous.SetKeywordMarkerFilter"/> or a custom <see cref="TokenFilter"/> that sets
+    /// the <see cref="KeywordAttribute"/> before this <seealso cref="TokenStream"/>.
+    /// </summary>
+    /// <seealso cref="Miscellaneous.SetKeywordMarkerFilter"/>
+    public sealed class PersianStemFilter : TokenFilter
+    {
+        private readonly PersianStemmer stemmer = new PersianStemmer();
+        private readonly ICharTermAttribute termAtt;
+        private readonly IKeywordAttribute keywordAttr;
+
+        public PersianStemFilter(TokenStream input)
+              : base(input)
+        {
+            termAtt = AddAttribute<ICharTermAttribute>();
+            keywordAttr = AddAttribute<IKeywordAttribute>();
+        }
+
+        public override bool IncrementToken()
+        {
+            if (m_input.IncrementToken())
+            {
+                if (!keywordAttr.IsKeyword)
+                {
+                    int newlen = stemmer.Stem(termAtt.Buffer, termAtt.Length);
+                    termAtt.Length = newlen;
+                }
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    }
+}
\ No newline at end of file

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilterFactory.cs
new file mode 100644
index 0000000..9a714bc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemFilterFactory.cs

@@ -0,0 +1,56 @@
+// Lucene version compatibility level 9.2
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Fa
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="PersianStemFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_fastem" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.PersianNormalizationFilterFactory"/&gt;
+    ///     &lt;filter class="solr.PersianStemFilterFactory"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    public class PersianStemFilterFactory : TokenFilterFactory
+    {
+
+        /// <summary>
+        /// Creates a new <see cref="PersianStemFilterFactory"/> </summary>
+        public PersianStemFilterFactory(IDictionary<string, string> args)
+              : base(args)
+        {
+            if (args.Count > 0)
+            {
+                throw new ArgumentException(string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args));
+            }
+        }
+
+        public override TokenStream Create(TokenStream input)
+        {
+            return new PersianStemFilter(input);
+        }
+    }
+}
\ No newline at end of file

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemmer.cs
new file mode 100644
index 0000000..4e99cc5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Fa/PersianStemmer.cs

@@ -0,0 +1,118 @@
+// Lucene version compatibility level 9.2
+using Lucene.Net.Analysis.Util;
+using System.Collections.Generic;
+using JCG = J2N.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Fa
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Stemmer for Persian.
+    /// <para/>
+    /// Stemming is done in-place for efficiency, operating on a termbuffer.
+    /// <para/>
+    /// Stemming is defined as:
+    /// <list type="bullet">
+    ///     <item><description> No prefixes are removed; only suffixes are stemmed.</description></item>
+    ///     <item><description> Stemming of common suffixes.</description></item>
+    /// </list>
+    /// </summary>
+    public class PersianStemmer
+    {
+        private const char ALEF = '\u0627';
+        private const char HEH = '\u0647';
+        private const char TEH = '\u062A';
+        private const char REH = '\u0631';
+        private const char NOON = '\u0646';
+        private const char YEH = '\u064A';
+        private const char ZWNJ = '\u200c'; // ZERO WIDTH NON-JOINER character
+
+        // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+        private static IList<char[]> Suffixes { get; } = InitializeSuffix();
+
+        private static IList<char[]> InitializeSuffix()
+        {
+            return new JCG.List<char[]>(){
+                ("" + ALEF + TEH).ToCharArray(),
+                ("" + ALEF + NOON).ToCharArray(),
+                ("" + TEH + REH + YEH + NOON).ToCharArray(),
+                ("" + TEH + REH).ToCharArray(),
+                ("" + YEH + YEH).ToCharArray(),
+                ("" + YEH).ToCharArray(),
+                ("" + HEH + ALEF).ToCharArray(),
+                ("" + ZWNJ).ToCharArray()
+            };
+        }
+
+        /// <summary>
+        /// Stem an input buffer of Persian text.
+        /// </summary>
+        /// <param name="s"> input buffer </param>
+        /// <param name="len"> length of input buffer </param>
+        /// <returns> length of input buffer after stemming </returns>
+        public virtual int Stem(char[] s, int len)
+        {
+            len = StemSuffix(s, len);
+
+            return len;
+        }
+
+        /// <summary>
+        /// Stem suffix(es) off a Persian word. </summary>
+        /// <param name="s"> input buffer </param>
+        /// <param name="len"> length of input buffer </param>
+        /// <returns> new length of input buffer after stemming </returns>
+        private int StemSuffix(char[] s, int len)
+        {
+            foreach (var suffix in Suffixes)
+            {
+                if (EndsWithCheckLength(s, len, suffix))
+                {
+                    len = StemmerUtil.DeleteN(s, len - suffix.Length, len, suffix.Length);
+                }
+            }
+            return len;
+        }
+
+        /// <summary>
+        /// Returns true if the suffix matches and can be stemmed </summary>
+        /// <param name="s"> input buffer </param>
+        /// <param name="len"> length of input buffer </param>
+        /// <param name="suffix"> suffix to check </param>
+        /// <returns> true if the suffix matches and can be stemmed </returns>
+        internal virtual bool EndsWithCheckLength(char[] s, int len, char[] suffix)
+        {
+            if (len < suffix.Length + 2) // all suffixes require at least 2 characters after stemming
+            {
+                return false;
+            }
+            else
+            {
+                for (int i = 0; i < suffix.Length; i++)
+                {
+                    if (s[len - suffix.Length + i] != suffix[i])
+                    {
+                        return false;
+                    }
+                }
+                return true;
+            }
+        }
+    }
+}
\ No newline at end of file

diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Fa/TestPersianStemFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Fa/TestPersianStemFilter.cs
new file mode 100644
index 0000000..f591632
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Fa/TestPersianStemFilter.cs

@@ -0,0 +1,118 @@
+// Lucene version compatibility level 9.2
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Fa
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Test the Persian Stem Filter
+    /// 
+    /// </summary>
+
+    public class TestPersianStemFilter : BaseTokenStreamTestCase
+    {
+        internal PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
+
+        [Test]
+        public virtual void TestAnSuffix()
+        {
+            CheckOneTerm(a, "دوستان", "دوست");
+        }
+
+        [Test]
+        public virtual void TestHaSuffix()
+        {
+            CheckOneTerm(a, "كتابها", "كتاب");
+        }
+
+        [Test]
+        public virtual void TestAtSuffix()
+        {
+            CheckOneTerm(a, "جامدات", "جامد");
+        }
+
+        [Test]
+        public virtual void TestYeeSuffix()
+        {
+            CheckOneTerm(a, "عليرضايي", "عليرضا");
+        }
+
+        [Test]
+        public virtual void TestYeSuffix()
+        {
+            CheckOneTerm(a, "شادماني", "شادمان");
+        }
+
+        [Test]
+        public virtual void TestTarSuffix()
+        {
+            CheckOneTerm(a, "باحالتر", "باحال");
+        }
+
+        [Test]
+        public virtual void TestTarinSuffix()
+        {
+            CheckOneTerm(a, "خوبترين", "خوب");
+        }
+
+        [Test]
+        public virtual void TestShouldntStem()
+        {
+            CheckOneTerm(a, "كباب", "كباب");
+        }
+
+        [Test]
+        public virtual void TestNonArabic()
+        {
+            CheckOneTerm(a, "English", "english");
+        }
+
+
+        [Test]
+        public virtual void TestWithKeywordAttribute()
+        {
+            CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
+            set.Add("ساهدهات");
+#pragma warning disable 612, 618
+            StandardTokenizer tokenStream = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader("ساهدهات"));
+#pragma warning restore 612, 618
+
+            PersianStemFilter filter = new PersianStemFilter(new SetKeywordMarkerFilter(tokenStream, set));
+            AssertTokenStreamContents(filter, new string[] { "ساهدهات" });
+        }
+
+        [Test]
+        public virtual void TestEmptyTerm()
+        {
+            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer tokenizer = new KeywordTokenizer(reader);
+                return new TokenStreamComponents(tokenizer, new PersianStemFilter(tokenizer));
+            });
+            CheckOneTerm(a, "", "");
+            a.Dispose();
+        }
+
+    }
+}
commit	c7ab459bdfc076fc12f7cba296f9a6ebff1bc2c9	[log] [tgz]
author	Ramin ALirezaee <raminmjj@users.noreply.github.com>	Sun May 22 17:48:26 2022 +0430
committer	GitHub <noreply@github.com>	Sun May 22 20:18:26 2022 +0700
tree	a62200b11c688dbec71a9f5891d05cd02f271b61
parent	a5c0b995cf0de3f3d7d19265728f55ef366e5a61 [diff]