blob: 53c4b28ae5c2bc9e0ffda7a21b1192d1941f36c8 [file] [log] [blame]
// Lucene version compatibility level 4.8.1
#if FEATURE_BREAKITERATOR
using J2N.Threading;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Attributes;
using Lucene.Net.Support.Threading;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Analysis.Th
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
///
/// </summary>
public class TestThaiAnalyzer : BaseTokenStreamTestCase
{
/// <summary>
/// Per-test setup; only delegates to the base class.
/// </summary>
[SetUp]
public override void SetUp()
{
    base.SetUp();
    // LUCENENET specific - ICU always has a dictionary-based BreakIterator in .NET,
    // so the Java-only availability assumption below is unnecessary here.
    //AssumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiTokenizer.DBBI_AVAILABLE);
}
/// <summary>
/// Verifies start/end offsets of the segmented Thai words: with an empty
/// stopword set, every word survives and the offsets tile the input
/// contiguously (each end offset equals the next start offset).
/// </summary>
[Test]
public virtual void TestOffsets()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.Empty);
    string input = "การที่ได้ต้องแสดงว่างานดี";
    string[] expectedTokens = { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" };
    int[] expectedStartOffsets = { 0, 3, 6, 9, 13, 17, 20, 23 };
    int[] expectedEndOffsets = { 3, 6, 9, 13, 17, 20, 23, 25 };
    AssertAnalyzesTo(analyzer, input, expectedTokens, expectedStartOffsets, expectedEndOffsets);
}
/// <summary>
/// Constructed without an explicit stopword set, the analyzer applies its
/// default stopwords: only three words survive, and the surviving tokens'
/// position increments account for the removed ones (first increment is 5).
/// </summary>
[Test]
public virtual void TestStopWords()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    string input = "การที่ได้ต้องแสดงว่างานดี";
    string[] expectedTokens = { "แสดง", "งาน", "ดี" };
    int[] expectedStartOffsets = { 13, 20, 23 };
    int[] expectedEndOffsets = { 17, 23, 25 };
    int[] expectedPosIncrements = { 5, 2, 1 };
    AssertAnalyzesTo(analyzer, input, expectedTokens, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);
}
/// <summary>
/// Backwards compatibility: with the legacy 3.5 match version, no
/// stopwords are removed — the full token sequence comes through.
/// </summary>
[Test]
public virtual void TestBackwardsStopWords()
{
#pragma warning disable 612, 618
    ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_35);
#pragma warning restore 612, 618
    AssertAnalyzesTo(analyzer,
        "การที่ได้ต้องแสดงว่างานดี",
        new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
        new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
        new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
/// <summary>
/// Thai numeric tokens were typed as &lt;ALPHANUM&gt; instead of &lt;NUM&gt;.
/// This historic (buggy) behavior is preserved for the 3.0 match version.
/// </summary>
/// @deprecated (3.1) testing backwards behavior
[Test]
[Obsolete("(3.1) testing backwards behavior")]
public virtual void TestBuggyTokenType30()
{
    // The trailing Thai numerals are expected to share the <ALPHANUM> type
    // with the word tokens (instead of <NUM>) under LUCENE_30.
    AssertAnalyzesTo(new ThaiAnalyzer(LuceneVersion.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" }, new string[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
}
/// <summary>
/// Exercises the analyzer under the legacy 3.0 match version: empty input,
/// plain Thai text, mixed Thai/Latin text with symbols, and English stopwords.
/// </summary>
/// @deprecated (3.1) testing backwards behavior
[Test]
[Obsolete("(3.1) testing backwards behavior")]
public virtual void TestAnalyzer30()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
    // Empty input yields no tokens.
    AssertAnalyzesTo(analyzer, "", new string[] { });
    AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });
    // Under 3.0, "xy&z" and the e-mail address remain single tokens
    // (newer versions split them - see TestReusableTokenStream).
    AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
    // English stop words
    AssertAnalyzesTo(analyzer, "ประโยคว่า The quick brown fox jumped over the lazy dogs", new string[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
}
// Elision character (Thai PAIYANNOI, U+0E2F); added to the suffix set below.
private static readonly char THAI_PAIYANNOI = (char)0x0E2F;
// Repeat character (Thai MAIYAMOK, U+0E46); added to the suffix set below.
private static readonly char THAI_MAIYAMOK = (char)0x0E46;
/// <summary>
/// LUCENENET specific: replays the UnicodeSet construction that the Thai
/// break engine's initializer performs (presumably mirroring ICU's
/// ThaiBreakEngine — TODO confirm against ICU4N sources), so that any
/// failure in the patterns or set operations surfaces directly in a test.
/// </summary>
[Test]
[LuceneNetSpecific]
public virtual void TestThaiBreakEngineInitializerCode()
{
    // Initialize UnicodeSets
    var fThaiWordSet = new ICU4N.Text.UnicodeSet();
    var fMarkSet = new ICU4N.Text.UnicodeSet();
    var fBeginWordSet = new ICU4N.Text.UnicodeSet();
    var fSuffixSet = new ICU4N.Text.UnicodeSet();
    // Thai characters that participate in dictionary-based line breaking.
    fThaiWordSet.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]]");
    fThaiWordSet.Compact();
    // Combining marks (plus the space character).
    fMarkSet.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    // End-of-word set: the Thai word set minus characters that cannot end a word.
    var fEndWordSet = new ICU4N.Text.UnicodeSet(fThaiWordSet);
    fEndWordSet.Remove(0x0E31); // MAI HAN-AKAT
    fEndWordSet.Remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
    fBeginWordSet.Add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK
    fBeginWordSet.Add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
    // Suffix characters declared as fields above.
    fSuffixSet.Add(THAI_PAIYANNOI);
    fSuffixSet.Add(THAI_MAIYAMOK);
    // Compact for caching
    fMarkSet.Compact();
    fEndWordSet.Compact();
    fBeginWordSet.Compact();
    fSuffixSet.Compact();
    // Freeze the static UnicodeSet
    fThaiWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
    fSuffixSet.Freeze();
}
/*
 * Test that position increments are adjusted correctly for stopwords.
 */
// note this test uses stopfilter's stopset
[Test]
public virtual void TestPositionIncrements()
{
    // English stop set so the embedded "the" is removed from otherwise-Thai input.
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    // "the" surrounded by whitespace: the token following the stopword
    // carries a position increment of 2 and offsets skip past the removed word.
    AssertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" }, new int[] { 0, 3, 6, 9, 18, 22, 25, 28 }, new int[] { 3, 6, 9, 13, 22, 25, 28, 30 }, new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
    // case that a stopword is adjacent to thai text, with no whitespace
    AssertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" }, new int[] { 0, 3, 6, 9, 17, 21, 24, 27 }, new int[] { 3, 6, 9, 13, 21, 24, 27, 29 }, new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}
/// <summary>
/// Runs the same analyzer instance over several inputs in a row, verifying
/// that its reusable token stream components reset correctly between calls.
/// </summary>
[Test]
public virtual void TestReusableTokenStream()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.Empty);
    // Empty input yields no tokens.
    AssertAnalyzesTo(analyzer, "", new string[] { });
    AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });
    // Current behavior splits "xy&z" and the e-mail address on punctuation
    // (compare TestReusableTokenStream30, where they stay whole).
    AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com", new string[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
}
/// <summary>
/// Same reuse scenario as <see cref="TestReusableTokenStream"/>, but under
/// the 3.0 compatibility version, where "xy&amp;z" and "xyz@demo.com"
/// remain single tokens.
/// </summary>
/// @deprecated (3.1) for version back compat
[Test]
[Obsolete("(3.1) for version back compat")]
public virtual void TestReusableTokenStream30()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
    // Empty input yields no tokens.
    AssertAnalyzesTo(analyzer, "", new string[] { });
    AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });
    AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
}
/// <summary>
/// LUCENENET specific: verifies that a single <see cref="ThaiAnalyzer"/>
/// instance can be used from several threads at once. All worker threads
/// wait on one latch and are released together to maximize contention.
/// </summary>
[Test]
[LuceneNetSpecific]
public virtual void TestConcurrency()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    // Fixed non-ASCII input built from raw char codes to pin the exact
    // characters (presumably chosen to reproduce a past failure - see the
    // test history for issue context).
    char[] chars = new char[] {
        (char)4160,
        (char)4124,
        (char)4097,
        (char)4177,
        (char)4113,
        (char)32,
        (char)10671,
    };
    string contents = new string(chars);
    // Sanity-check on the current thread before going concurrent.
    AssertAnalyzer(analyzer, contents);
    int numThreads = 4;
    // LUCENENET: CountdownEvent is IDisposable; dispose it once all
    // workers have finished (fixes a resource leak in the original).
    using (var startingGun = new CountdownEvent(1))
    {
        var threads = new ThaiAnalyzerThread[numThreads];
        for (int i = 0; i < threads.Length; i++)
        {
            threads[i] = new ThaiAnalyzerThread(startingGun, analyzer, contents);
        }
        foreach (var thread in threads)
        {
            thread.Start();
        }
        // Release every waiting worker at the same moment.
        startingGun.Signal();
        foreach (var t in threads)
        {
            try
            {
                t.Join();
            }
            catch (Exception e) when (e.IsInterruptedException())
            {
                fail("Thread interrupted");
            }
        }
    }
}
/// <summary>
/// Worker thread for <see cref="TestConcurrency"/>: blocks on a shared
/// latch, then repeatedly analyzes the same text with the shared analyzer.
/// </summary>
private class ThaiAnalyzerThread : ThreadJob
{
    private readonly CountdownEvent startSignal;
    private readonly Analyzer sharedAnalyzer;
    private readonly string input;

    public ThaiAnalyzerThread(CountdownEvent latch, Analyzer analyzer, string text)
    {
        startSignal = latch;
        sharedAnalyzer = analyzer;
        input = text;
    }

    public override void Run()
    {
        // Wait until the coordinating test releases all workers at once.
        startSignal.Wait();
        const int iterations = 1000;
        for (int iteration = 0; iteration < iterations; iteration++)
        {
            AssertAnalyzer(sharedAnalyzer, input);
        }
    }
}
/// <summary>
/// Analyzes <paramref name="text"/> twice with the same analyzer:
/// the first pass collects tokens, position increments and offsets by hand;
/// the second pass re-checks them via <c>AssertTokenStreamContents</c>,
/// proving the analyzer's reused components produce identical results.
/// </summary>
private static void AssertAnalyzer(Analyzer analyzer, string text)
{
    ICharTermAttribute termAtt;
    IOffsetAttribute offsetAtt;
    IPositionIncrementAttribute posIncAtt;
    JCG.List<string> tokens = new JCG.List<string>();
    JCG.List<int> positions = new JCG.List<int>();
    JCG.List<int> startOffsets = new JCG.List<int>();
    JCG.List<int> endOffsets = new JCG.List<int>();
    TokenStream ts;
    TextReader reader = new StringReader(text);

    // First pass: collect the analyzer's output manually.
    using (ts = analyzer.GetTokenStream("dummy", reader))
    {
        bool isReset = false;
        try
        {
            termAtt = ts.GetAttribute<ICharTermAttribute>();
            offsetAtt = ts.GetAttribute<IOffsetAttribute>();
            posIncAtt = ts.GetAttribute<IPositionIncrementAttribute>();
            ts.Reset();
            isReset = true;
            while (ts.IncrementToken())
            {
                Assert.IsNotNull(termAtt, "has no CharTermAttribute");
                tokens.Add(termAtt.ToString());
                positions.Add(posIncAtt.PositionIncrement);
                startOffsets.Add(offsetAtt.StartOffset);
                endOffsets.Add(offsetAtt.EndOffset);
            }
        }
        finally
        {
            ConsumeFullyIfNotReset(ts, isReset);
            ts.End();
        }
    } // implicit ts.Dispose()

    // Second pass: the reused analyzer must reproduce exactly the same
    // tokens, offsets and position increments.
    reader = new StringReader(text);
    using (ts = analyzer.GetTokenStream("dummy", reader))
    {
        bool isReset = false;
        try
        {
            AssertTokenStreamContents(ts,
                output: tokens.ToArray(),
                startOffsets: ToIntArray(startOffsets),
                endOffsets: ToIntArray(endOffsets),
                types: null,
                posIncrements: ToIntArray(positions),
                posLengths: null,
                finalOffset: text.Length,
                offsetsAreCorrect: true);
            isReset = true;
        }
        finally
        {
            ConsumeFullyIfNotReset(ts, isReset);
            ts.End();
        }
    } // implicit ts.Dispose()
}

/// <summary>
/// If the stream was never successfully reset (an exception fired before
/// consumption began), consume it fully so the subsequent End()/Dispose()
/// calls operate on a correctly-consumed stream; any secondary exception
/// is swallowed so the original one propagates unmasked.
/// </summary>
private static void ConsumeFullyIfNotReset(TokenStream ts, bool isReset)
{
    if (isReset)
    {
        return;
    }
    try
    {
        // consume correctly
        ts.Reset();
        while (ts.IncrementToken()) ;
    }
    catch (Exception)
    {
        // ignore - must not mask the exception already in flight
    }
}
/// <summary>
/// blast some random strings through the analyzer </summary>
[Test]
[AwaitsFix(BugUrl = "https://github.com/apache/lucenenet/issues/269")] // LUCENENET TODO: this test occasionally fails
public virtual void TestRandomStrings()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    int iterations = 1000 * RandomMultiplier;
    CheckRandomData(Random, analyzer, iterations);
}
/// <summary>
/// blast some random large strings through the analyzer </summary>
///
[Test]
[AwaitsFix(BugUrl = "https://github.com/apache/lucenenet/issues/269")] // LUCENENET TODO: this test occasionally fails
public virtual void TestRandomHugeStrings()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    int iterations = 100 * RandomMultiplier;
    // 8192 = maximum word length for the random data generator.
    CheckRandomData(Random, analyzer, iterations, 8192);
}
/// <summary>
/// LUCENE-3044: a token stream must still work when a later consumer adds
/// an attribute (<see cref="IFlagsAttribute"/>) that the analyzer itself
/// never uses.
/// </summary>
[Test]
public virtual void TestAttributeReuse()
{
#pragma warning disable 612, 618
    ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
#pragma warning restore 612, 618
    const string input = "ภาษาไทย";
    string[] expected = { "ภาษา", "ไทย" };
    // just consume
    TokenStream ts = analyzer.GetTokenStream("dummy", input);
    AssertTokenStreamContents(ts, expected);
    // this consumer adds flagsAtt, which this analyzer does not use.
    ts = analyzer.GetTokenStream("dummy", input);
    ts.AddAttribute<IFlagsAttribute>();
    AssertTokenStreamContents(ts, expected);
}
/// <summary>
/// Mixed English + Thai input with an empty stopword set: the English
/// words come out lowercased, the Thai run is segmented into words, and
/// the offsets continue correctly across the language boundary.
/// </summary>
[Test]
public virtual void TestTwoSentences()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.Empty);
    string input = "This is a test. การที่ได้ต้องแสดงว่างานดี";
    string[] expectedTokens = { "this", "is", "a", "test", "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" };
    int[] expectedStartOffsets = { 0, 5, 8, 10, 16, 19, 22, 25, 29, 33, 36, 39 };
    int[] expectedEndOffsets = { 4, 7, 9, 14, 19, 22, 25, 29, 33, 36, 39, 41 };
    AssertAnalyzesTo(analyzer, input, expectedTokens, expectedStartOffsets, expectedEndOffsets);
}
/// <summary>
/// LUCENENET: Tests scenario outlined in <see cref="ThaiWordBreaker"/> —
/// a run of Thai digits followed by ASCII digits stays one token.
/// </summary>
[Test]
[LuceneNetSpecific]
public virtual void TestNumeralBreaking()
{
    var analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.Empty);
    string[] expected = { "๑๒๓456" };
    AssertAnalyzesTo(analyzer, "๑๒๓456", expected);
}
}
}
#endif