src/Lucene.Net.Tests.Analysis.Common/Analysis/Fa/TestPersianAnalyzer.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis.Util;
 using NUnit.Framework;

 namespace Lucene.Net.Analysis.Fa
 {
     /*
 	 * Licensed to the Apache Software Foundation (ASF) under one or more
 	 * contributor license agreements.  See the NOTICE file distributed with
 	 * this work for additional information regarding copyright ownership.
 	 * The ASF licenses this file to You under the Apache License, Version 2.0
 	 * (the "License"); you may not use this file except in compliance with
 	 * the License.  You may obtain a copy of the License at
 	 *
 	 *     http://www.apache.org/licenses/LICENSE-2.0
 	 *
 	 * Unless required by applicable law or agreed to in writing, software
 	 * distributed under the License is distributed on an "AS IS" BASIS,
 	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 	 * See the License for the specific language governing permissions and
 	 * limitations under the License.
 	 */

     /// <summary>
     /// Test the Persian Analyzer
     ///
     /// </summary>
     public class TestPersianAnalyzer : BaseTokenStreamTestCase
     {

         /// <summary>
         /// This test fails with NPE when the stopwords file is missing in classpath
         /// </summary>
         public virtual void TestResourcesAvailable()
         {
             new PersianAnalyzer(TEST_VERSION_CURRENT);
         }

         /// <summary>
         /// This test shows how the combination of tokenization (breaking on zero-width
         /// non-joiner), normalization (such as treating arabic YEH and farsi YEH the
         /// same), and stopwords creates a light-stemming effect for verbs.
         ///
         /// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
         /// </summary>
         [Test]
         public virtual void TestBehaviorVerbs()
         {
             Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
             // active present indicative
             AssertAnalyzesTo(a, "می‌خورد", new string[] { "خورد" });
             // active preterite indicative
             AssertAnalyzesTo(a, "خورد", new string[] { "خورد" });
             // active imperfective preterite indicative
             AssertAnalyzesTo(a, "می‌خورد", new string[] { "خورد" });
             // active future indicative
             AssertAnalyzesTo(a, "خواهد خورد", new string[] { "خورد" });
             // active present progressive indicative
             AssertAnalyzesTo(a, "دارد می‌خورد", new string[] { "خورد" });
             // active preterite progressive indicative
             AssertAnalyzesTo(a, "داشت می‌خورد", new string[] { "خورد" });

             // active perfect indicative
             AssertAnalyzesTo(a, "خورده‌است", new string[] { "خورده" });
             // active imperfective perfect indicative
             AssertAnalyzesTo(a, "می‌خورده‌است", new string[] { "خورده" });
             // active pluperfect indicative
             AssertAnalyzesTo(a, "خورده بود", new string[] { "خورده" });
             // active imperfective pluperfect indicative
             AssertAnalyzesTo(a, "می‌خورده بود", new string[] { "خورده" });
             // active preterite subjunctive
             AssertAnalyzesTo(a, "خورده باشد", new string[] { "خورده" });
             // active imperfective preterite subjunctive
             AssertAnalyzesTo(a, "می‌خورده باشد", new string[] { "خورده" });
             // active pluperfect subjunctive
             AssertAnalyzesTo(a, "خورده بوده باشد", new string[] { "خورده" });
             // active imperfective pluperfect subjunctive
             AssertAnalyzesTo(a, "می‌خورده بوده باشد", new string[] { "خورده" });
             // passive present indicative
             AssertAnalyzesTo(a, "خورده می‌شود", new string[] { "خورده" });
             // passive preterite indicative
             AssertAnalyzesTo(a, "خورده شد", new string[] { "خورده" });
             // passive imperfective preterite indicative
             AssertAnalyzesTo(a, "خورده می‌شد", new string[] { "خورده" });
             // passive perfect indicative
             AssertAnalyzesTo(a, "خورده شده‌است", new string[] { "خورده" });
             // passive imperfective perfect indicative
             AssertAnalyzesTo(a, "خورده می‌شده‌است", new string[] { "خورده" });
             // passive pluperfect indicative
             AssertAnalyzesTo(a, "خورده شده بود", new string[] { "خورده" });
             // passive imperfective pluperfect indicative
             AssertAnalyzesTo(a, "خورده می‌شده بود", new string[] { "خورده" });
             // passive future indicative
             AssertAnalyzesTo(a, "خورده خواهد شد", new string[] { "خورده" });
             // passive present progressive indicative
             AssertAnalyzesTo(a, "دارد خورده می‌شود", new string[] { "خورده" });
             // passive preterite progressive indicative
             AssertAnalyzesTo(a, "داشت خورده می‌شد", new string[] { "خورده" });
             // passive present subjunctive
             AssertAnalyzesTo(a, "خورده شود", new string[] { "خورده" });
             // passive preterite subjunctive
             AssertAnalyzesTo(a, "خورده شده باشد", new string[] { "خورده" });
             // passive imperfective preterite subjunctive
             AssertAnalyzesTo(a, "خورده می‌شده باشد", new string[] { "خورده" });
             // passive pluperfect subjunctive
             AssertAnalyzesTo(a, "خورده شده بوده باشد", new string[] { "خورده" });
             // passive imperfective pluperfect subjunctive
             AssertAnalyzesTo(a, "خورده می‌شده بوده باشد", new string[] { "خورده" });

             // active present subjunctive
             AssertAnalyzesTo(a, "بخورد", new string[] { "بخورد" });
         }

         /// <summary>
         /// This test shows how the combination of tokenization and stopwords creates a
         /// light-stemming effect for verbs.
         ///
         /// In this case, these forms are presented with alternative orthography, using
         /// arabic yeh and whitespace. This yeh phenomenon is common for legacy text
         /// due to some previous bugs in Microsoft Windows.
         ///
         /// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
         /// </summary>
         [Test]
         public virtual void TestBehaviorVerbsDefective()
         {
             Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
             // active present indicative
             AssertAnalyzesTo(a, "مي خورد", new string[] { "خورد" });
             // active preterite indicative
             AssertAnalyzesTo(a, "خورد", new string[] { "خورد" });
             // active imperfective preterite indicative
             AssertAnalyzesTo(a, "مي خورد", new string[] { "خورد" });
             // active future indicative
             AssertAnalyzesTo(a, "خواهد خورد", new string[] { "خورد" });
             // active present progressive indicative
             AssertAnalyzesTo(a, "دارد مي خورد", new string[] { "خورد" });
             // active preterite progressive indicative
             AssertAnalyzesTo(a, "داشت مي خورد", new string[] { "خورد" });

             // active perfect indicative
             AssertAnalyzesTo(a, "خورده است", new string[] { "خورده" });
             // active imperfective perfect indicative
             AssertAnalyzesTo(a, "مي خورده است", new string[] { "خورده" });
             // active pluperfect indicative
             AssertAnalyzesTo(a, "خورده بود", new string[] { "خورده" });
             // active imperfective pluperfect indicative
             AssertAnalyzesTo(a, "مي خورده بود", new string[] { "خورده" });
             // active preterite subjunctive
             AssertAnalyzesTo(a, "خورده باشد", new string[] { "خورده" });
             // active imperfective preterite subjunctive
             AssertAnalyzesTo(a, "مي خورده باشد", new string[] { "خورده" });
             // active pluperfect subjunctive
             AssertAnalyzesTo(a, "خورده بوده باشد", new string[] { "خورده" });
             // active imperfective pluperfect subjunctive
             AssertAnalyzesTo(a, "مي خورده بوده باشد", new string[] { "خورده" });
             // passive present indicative
             AssertAnalyzesTo(a, "خورده مي شود", new string[] { "خورده" });
             // passive preterite indicative
             AssertAnalyzesTo(a, "خورده شد", new string[] { "خورده" });
             // passive imperfective preterite indicative
             AssertAnalyzesTo(a, "خورده مي شد", new string[] { "خورده" });
             // passive perfect indicative
             AssertAnalyzesTo(a, "خورده شده است", new string[] { "خورده" });
             // passive imperfective perfect indicative
             AssertAnalyzesTo(a, "خورده مي شده است", new string[] { "خورده" });
             // passive pluperfect indicative
             AssertAnalyzesTo(a, "خورده شده بود", new string[] { "خورده" });
             // passive imperfective pluperfect indicative
             AssertAnalyzesTo(a, "خورده مي شده بود", new string[] { "خورده" });
             // passive future indicative
             AssertAnalyzesTo(a, "خورده خواهد شد", new string[] { "خورده" });
             // passive present progressive indicative
             AssertAnalyzesTo(a, "دارد خورده مي شود", new string[] { "خورده" });
             // passive preterite progressive indicative
             AssertAnalyzesTo(a, "داشت خورده مي شد", new string[] { "خورده" });
             // passive present subjunctive
             AssertAnalyzesTo(a, "خورده شود", new string[] { "خورده" });
             // passive preterite subjunctive
             AssertAnalyzesTo(a, "خورده شده باشد", new string[] { "خورده" });
             // passive imperfective preterite subjunctive
             AssertAnalyzesTo(a, "خورده مي شده باشد", new string[] { "خورده" });
             // passive pluperfect subjunctive
             AssertAnalyzesTo(a, "خورده شده بوده باشد", new string[] { "خورده" });
             // passive imperfective pluperfect subjunctive
             AssertAnalyzesTo(a, "خورده مي شده بوده باشد", new string[] { "خورده" });

             // active present subjunctive
             AssertAnalyzesTo(a, "بخورد", new string[] { "بخورد" });
         }

         /// <summary>
         /// This test shows how the combination of tokenization (breaking on zero-width
         /// non-joiner or space) and stopwords creates a light-stemming effect for
         /// nouns, removing the plural -ha.
         /// </summary>
         [Test]
         public virtual void TestBehaviorNouns()
         {
             Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
             AssertAnalyzesTo(a, "برگ ها", new string[] { "برگ" });
             AssertAnalyzesTo(a, "برگ‌ها", new string[] { "برگ" });
         }

         /// <summary>
         /// Test showing that non-persian text is treated very much like SimpleAnalyzer
         /// (lowercased, etc)
         /// </summary>
         [Test]
         public virtual void TestBehaviorNonPersian()
         {
             Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
             AssertAnalyzesTo(a, "English test.", new string[] { "english", "test" });
         }

         /// <summary>
         /// Basic test ensuring that tokenStream works correctly.
         /// </summary>
         [Test]
         public virtual void TestReusableTokenStream()
         {
             Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
             AssertAnalyzesTo(a, "خورده مي شده بوده باشد", new string[] { "خورده" });
             AssertAnalyzesTo(a, "برگ‌ها", new string[] { "برگ" });
         }

         /// <summary>
         /// Test that custom stopwords work, and are not case-sensitive.
         /// </summary>
         [Test]
         public virtual void TestCustomStopwords()
         {
             PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, new CharArraySet(TEST_VERSION_CURRENT, AsSet("the", "and", "a"), false));
             AssertAnalyzesTo(a, "The quick brown fox.", new string[] { "quick", "brown", "fox" });
         }

         /// <summary>
         /// blast some random strings through the analyzer </summary>
         [Test]
         public virtual void TestRandomStrings()
         {
             CheckRandomData(Random, new PersianAnalyzer(TEST_VERSION_CURRENT), 1000 * RANDOM_MULTIPLIER);
         }
     }
 }
	using Lucene.Net.Analysis.Util;
	using NUnit.Framework;

	namespace Lucene.Net.Analysis.Fa
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Test the Persian Analyzer
	///
	/// </summary>
	public class TestPersianAnalyzer : BaseTokenStreamTestCase
	{

	/// <summary>
	/// This test fails with NPE when the stopwords file is missing in classpath
	/// </summary>
	public virtual void TestResourcesAvailable()
	{
	new PersianAnalyzer(TEST_VERSION_CURRENT);
	}

	/// <summary>
	/// This test shows how the combination of tokenization (breaking on zero-width
	/// non-joiner), normalization (such as treating arabic YEH and farsi YEH the
	/// same), and stopwords creates a light-stemming effect for verbs.
	///
	/// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
	/// </summary>
	[Test]
	public virtual void TestBehaviorVerbs()
	{
	Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
	// active present indicative
	AssertAnalyzesTo(a, "می‌خورد", new string[] { "خورد" });
	// active preterite indicative
	AssertAnalyzesTo(a, "خورد", new string[] { "خورد" });
	// active imperfective preterite indicative
	AssertAnalyzesTo(a, "می‌خورد", new string[] { "خورد" });
	// active future indicative
	AssertAnalyzesTo(a, "خواهد خورد", new string[] { "خورد" });
	// active present progressive indicative
	AssertAnalyzesTo(a, "دارد می‌خورد", new string[] { "خورد" });
	// active preterite progressive indicative
	AssertAnalyzesTo(a, "داشت می‌خورد", new string[] { "خورد" });

	// active perfect indicative
	AssertAnalyzesTo(a, "خورده‌است", new string[] { "خورده" });
	// active imperfective perfect indicative
	AssertAnalyzesTo(a, "می‌خورده‌است", new string[] { "خورده" });
	// active pluperfect indicative
	AssertAnalyzesTo(a, "خورده بود", new string[] { "خورده" });
	// active imperfective pluperfect indicative
	AssertAnalyzesTo(a, "می‌خورده بود", new string[] { "خورده" });
	// active preterite subjunctive
	AssertAnalyzesTo(a, "خورده باشد", new string[] { "خورده" });
	// active imperfective preterite subjunctive
	AssertAnalyzesTo(a, "می‌خورده باشد", new string[] { "خورده" });
	// active pluperfect subjunctive
	AssertAnalyzesTo(a, "خورده بوده باشد", new string[] { "خورده" });
	// active imperfective pluperfect subjunctive
	AssertAnalyzesTo(a, "می‌خورده بوده باشد", new string[] { "خورده" });
	// passive present indicative
	AssertAnalyzesTo(a, "خورده می‌شود", new string[] { "خورده" });
	// passive preterite indicative
	AssertAnalyzesTo(a, "خورده شد", new string[] { "خورده" });
	// passive imperfective preterite indicative
	AssertAnalyzesTo(a, "خورده می‌شد", new string[] { "خورده" });
	// passive perfect indicative
	AssertAnalyzesTo(a, "خورده شده‌است", new string[] { "خورده" });
	// passive imperfective perfect indicative
	AssertAnalyzesTo(a, "خورده می‌شده‌است", new string[] { "خورده" });
	// passive pluperfect indicative
	AssertAnalyzesTo(a, "خورده شده بود", new string[] { "خورده" });
	// passive imperfective pluperfect indicative
	AssertAnalyzesTo(a, "خورده می‌شده بود", new string[] { "خورده" });
	// passive future indicative
	AssertAnalyzesTo(a, "خورده خواهد شد", new string[] { "خورده" });
	// passive present progressive indicative
	AssertAnalyzesTo(a, "دارد خورده می‌شود", new string[] { "خورده" });
	// passive preterite progressive indicative
	AssertAnalyzesTo(a, "داشت خورده می‌شد", new string[] { "خورده" });
	// passive present subjunctive
	AssertAnalyzesTo(a, "خورده شود", new string[] { "خورده" });
	// passive preterite subjunctive
	AssertAnalyzesTo(a, "خورده شده باشد", new string[] { "خورده" });
	// passive imperfective preterite subjunctive
	AssertAnalyzesTo(a, "خورده می‌شده باشد", new string[] { "خورده" });
	// passive pluperfect subjunctive
	AssertAnalyzesTo(a, "خورده شده بوده باشد", new string[] { "خورده" });
	// passive imperfective pluperfect subjunctive
	AssertAnalyzesTo(a, "خورده می‌شده بوده باشد", new string[] { "خورده" });

	// active present subjunctive
	AssertAnalyzesTo(a, "بخورد", new string[] { "بخورد" });
	}

	/// <summary>
	/// This test shows how the combination of tokenization and stopwords creates a
	/// light-stemming effect for verbs.
	///
	/// In this case, these forms are presented with alternative orthography, using
	/// arabic yeh and whitespace. This yeh phenomenon is common for legacy text
	/// due to some previous bugs in Microsoft Windows.
	///
	/// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
	/// </summary>
	[Test]
	public virtual void TestBehaviorVerbsDefective()
	{
	Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
	// active present indicative
	AssertAnalyzesTo(a, "مي خورد", new string[] { "خورد" });
	// active preterite indicative
	AssertAnalyzesTo(a, "خورد", new string[] { "خورد" });
	// active imperfective preterite indicative
	AssertAnalyzesTo(a, "مي خورد", new string[] { "خورد" });
	// active future indicative
	AssertAnalyzesTo(a, "خواهد خورد", new string[] { "خورد" });
	// active present progressive indicative
	AssertAnalyzesTo(a, "دارد مي خورد", new string[] { "خورد" });
	// active preterite progressive indicative
	AssertAnalyzesTo(a, "داشت مي خورد", new string[] { "خورد" });

	// active perfect indicative
	AssertAnalyzesTo(a, "خورده است", new string[] { "خورده" });
	// active imperfective perfect indicative
	AssertAnalyzesTo(a, "مي خورده است", new string[] { "خورده" });
	// active pluperfect indicative
	AssertAnalyzesTo(a, "خورده بود", new string[] { "خورده" });
	// active imperfective pluperfect indicative
	AssertAnalyzesTo(a, "مي خورده بود", new string[] { "خورده" });
	// active preterite subjunctive
	AssertAnalyzesTo(a, "خورده باشد", new string[] { "خورده" });
	// active imperfective preterite subjunctive
	AssertAnalyzesTo(a, "مي خورده باشد", new string[] { "خورده" });
	// active pluperfect subjunctive
	AssertAnalyzesTo(a, "خورده بوده باشد", new string[] { "خورده" });
	// active imperfective pluperfect subjunctive
	AssertAnalyzesTo(a, "مي خورده بوده باشد", new string[] { "خورده" });
	// passive present indicative
	AssertAnalyzesTo(a, "خورده مي شود", new string[] { "خورده" });
	// passive preterite indicative
	AssertAnalyzesTo(a, "خورده شد", new string[] { "خورده" });
	// passive imperfective preterite indicative
	AssertAnalyzesTo(a, "خورده مي شد", new string[] { "خورده" });
	// passive perfect indicative
	AssertAnalyzesTo(a, "خورده شده است", new string[] { "خورده" });
	// passive imperfective perfect indicative
	AssertAnalyzesTo(a, "خورده مي شده است", new string[] { "خورده" });
	// passive pluperfect indicative
	AssertAnalyzesTo(a, "خورده شده بود", new string[] { "خورده" });
	// passive imperfective pluperfect indicative
	AssertAnalyzesTo(a, "خورده مي شده بود", new string[] { "خورده" });
	// passive future indicative
	AssertAnalyzesTo(a, "خورده خواهد شد", new string[] { "خورده" });
	// passive present progressive indicative
	AssertAnalyzesTo(a, "دارد خورده مي شود", new string[] { "خورده" });
	// passive preterite progressive indicative
	AssertAnalyzesTo(a, "داشت خورده مي شد", new string[] { "خورده" });
	// passive present subjunctive
	AssertAnalyzesTo(a, "خورده شود", new string[] { "خورده" });
	// passive preterite subjunctive
	AssertAnalyzesTo(a, "خورده شده باشد", new string[] { "خورده" });
	// passive imperfective preterite subjunctive
	AssertAnalyzesTo(a, "خورده مي شده باشد", new string[] { "خورده" });
	// passive pluperfect subjunctive
	AssertAnalyzesTo(a, "خورده شده بوده باشد", new string[] { "خورده" });
	// passive imperfective pluperfect subjunctive
	AssertAnalyzesTo(a, "خورده مي شده بوده باشد", new string[] { "خورده" });

	// active present subjunctive
	AssertAnalyzesTo(a, "بخورد", new string[] { "بخورد" });
	}

	/// <summary>
	/// This test shows how the combination of tokenization (breaking on zero-width
	/// non-joiner or space) and stopwords creates a light-stemming effect for
	/// nouns, removing the plural -ha.
	/// </summary>
	[Test]
	public virtual void TestBehaviorNouns()
	{
	Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
	AssertAnalyzesTo(a, "برگ ها", new string[] { "برگ" });
	AssertAnalyzesTo(a, "برگ‌ها", new string[] { "برگ" });
	}

	/// <summary>
	/// Test showing that non-persian text is treated very much like SimpleAnalyzer
	/// (lowercased, etc)
	/// </summary>
	[Test]
	public virtual void TestBehaviorNonPersian()
	{
	Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
	AssertAnalyzesTo(a, "English test.", new string[] { "english", "test" });
	}

	/// <summary>
	/// Basic test ensuring that tokenStream works correctly.
	/// </summary>
	[Test]
	public virtual void TestReusableTokenStream()
	{
	Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
	AssertAnalyzesTo(a, "خورده مي شده بوده باشد", new string[] { "خورده" });
	AssertAnalyzesTo(a, "برگ‌ها", new string[] { "برگ" });
	}

	/// <summary>
	/// Test that custom stopwords work, and are not case-sensitive.
	/// </summary>
	[Test]
	public virtual void TestCustomStopwords()
	{
	PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, new CharArraySet(TEST_VERSION_CURRENT, AsSet("the", "and", "a"), false));
	AssertAnalyzesTo(a, "The quick brown fox.", new string[] { "quick", "brown", "fox" });
	}

	/// <summary>
	/// blast some random strings through the analyzer </summary>
	[Test]
	public virtual void TestRandomStrings()
	{
	CheckRandomData(Random, new PersianAnalyzer(TEST_VERSION_CURRENT), 1000 * RANDOM_MULTIPLIER);
	}
	}
	}