blob: 53c4b28ae5c2bc9e0ffda7a21b1192d1941f36c8 [file] [log] [blame]
// Lucene version compatibility level 4.8.1
#if FEATURE_BREAKITERATOR
using J2N.Threading;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Attributes;
using Lucene.Net.Support.Threading;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Analysis.Th
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
///
/// </summary>
public class TestThaiAnalyzer : BaseTokenStreamTestCase
{
/// <summary>
/// Per-test setup; only delegates to the base class.
/// </summary>
[SetUp]
public override void SetUp()
{
    base.SetUp();
    // LUCENENET specific - ICU always has a dictionary-based BreakIterator in .NET,
    // so the Java-only availability assumption below is unnecessary here.
    //AssumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiTokenizer.DBBI_AVAILABLE);
}
/// <summary>
/// Verifies start/end offsets of the segmented Thai words: with an empty
/// stopword set, every word survives and the offsets tile the input
/// contiguously (each end offset equals the next start offset).
/// </summary>
[Test]
public virtual void TestOffsets()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.Empty);
    string input = "การที่ได้ต้องแสดงว่างานดี";
    string[] expectedTokens = { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" };
    int[] expectedStartOffsets = { 0, 3, 6, 9, 13, 17, 20, 23 };
    int[] expectedEndOffsets = { 3, 6, 9, 13, 17, 20, 23, 25 };
    AssertAnalyzesTo(analyzer, input, expectedTokens, expectedStartOffsets, expectedEndOffsets);
}
/// <summary>
/// Constructed without an explicit stopword set, the analyzer applies its
/// default stopwords: only three words survive, and the surviving tokens'
/// position increments account for the removed ones (first increment is 5).
/// </summary>
[Test]
public virtual void TestStopWords()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    string input = "การที่ได้ต้องแสดงว่างานดี";
    string[] expectedTokens = { "แสดง", "งาน", "ดี" };
    int[] expectedStartOffsets = { 13, 20, 23 };
    int[] expectedEndOffsets = { 17, 23, 25 };
    int[] expectedPosIncrements = { 5, 2, 1 };
    AssertAnalyzesTo(analyzer, input, expectedTokens, expectedStartOffsets, expectedEndOffsets, expectedPosIncrements);
}
/// <summary>
/// Backwards compatibility: with the legacy 3.5 match version, no
/// stopwords are removed — the full token sequence comes through.
/// </summary>
[Test]
public virtual void TestBackwardsStopWords()
{
#pragma warning disable 612, 618
    ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_35);
#pragma warning restore 612, 618
    AssertAnalyzesTo(analyzer,
        "การที่ได้ต้องแสดงว่างานดี",
        new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
        new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
        new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
/// <summary>
/// Thai numeric tokens were typed as &lt;ALPHANUM&gt; instead of &lt;NUM&gt;.
/// This historic (buggy) behavior is preserved for the 3.0 match version.
/// </summary>
/// @deprecated (3.1) testing backwards behavior
[Test]
[Obsolete("(3.1) testing backwards behavior")]
public virtual void TestBuggyTokenType30()
{
    // The trailing Thai numerals are expected to share the <ALPHANUM> type
    // with the word tokens (instead of <NUM>) under LUCENE_30.
    AssertAnalyzesTo(new ThaiAnalyzer(LuceneVersion.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" }, new string[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
}
/// <summary>
/// Exercises the analyzer under the legacy 3.0 match version: empty input,
/// plain Thai text, mixed Thai/Latin text with symbols, and English stopwords.
/// </summary>
/// @deprecated (3.1) testing backwards behavior
[Test]
[Obsolete("(3.1) testing backwards behavior")]
public virtual void TestAnalyzer30()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
    // Empty input yields no tokens.
    AssertAnalyzesTo(analyzer, "", new string[] { });
    AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });
    // Under 3.0, "xy&z" and the e-mail address remain single tokens
    // (newer versions split them - see TestReusableTokenStream).
    AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
    // English stop words
    AssertAnalyzesTo(analyzer, "ประโยคว่า The quick brown fox jumped over the lazy dogs", new string[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
}
// Elision character (Thai PAIYANNOI, U+0E2F); added to the suffix set below.
private static readonly char THAI_PAIYANNOI = (char)0x0E2F;
// Repeat character (Thai MAIYAMOK, U+0E46); added to the suffix set below.
private static readonly char THAI_MAIYAMOK = (char)0x0E46;
/// <summary>
/// LUCENENET specific: replays the UnicodeSet construction that the Thai
/// break engine's initializer performs (presumably mirroring ICU's
/// ThaiBreakEngine — TODO confirm against ICU4N sources), so that any
/// failure in the patterns or set operations surfaces directly in a test.
/// </summary>
[Test]
[LuceneNetSpecific]
public virtual void TestThaiBreakEngineInitializerCode()
{
    // Initialize UnicodeSets
    var fThaiWordSet = new ICU4N.Text.UnicodeSet();
    var fMarkSet = new ICU4N.Text.UnicodeSet();
    var fBeginWordSet = new ICU4N.Text.UnicodeSet();
    var fSuffixSet = new ICU4N.Text.UnicodeSet();
    // Thai characters that participate in dictionary-based line breaking.
    fThaiWordSet.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]]");
    fThaiWordSet.Compact();
    // Combining marks (plus the space character).
    fMarkSet.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    // End-of-word set: the Thai word set minus characters that cannot end a word.
    var fEndWordSet = new ICU4N.Text.UnicodeSet(fThaiWordSet);
    fEndWordSet.Remove(0x0E31); // MAI HAN-AKAT
    fEndWordSet.Remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
    fBeginWordSet.Add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK
    fBeginWordSet.Add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
    // Suffix characters declared as fields above.
    fSuffixSet.Add(THAI_PAIYANNOI);
    fSuffixSet.Add(THAI_MAIYAMOK);
    // Compact for caching
    fMarkSet.Compact();
    fEndWordSet.Compact();
    fBeginWordSet.Compact();
    fSuffixSet.Compact();
    // Freeze the static UnicodeSet
    fThaiWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
    fSuffixSet.Freeze();
}
/*
 * Test that position increments are adjusted correctly for stopwords.
 */
// note this test uses stopfilter's stopset
[Test]
public virtual void TestPositionIncrements()
{
    // English stop set so the embedded "the" is removed from otherwise-Thai input.
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    // "the" surrounded by whitespace: the token following the stopword
    // carries a position increment of 2 and offsets skip past the removed word.
    AssertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" }, new int[] { 0, 3, 6, 9, 18, 22, 25, 28 }, new int[] { 3, 6, 9, 13, 22, 25, 28, 30 }, new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
    // case that a stopword is adjacent to thai text, with no whitespace
    AssertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" }, new int[] { 0, 3, 6, 9, 17, 21, 24, 27 }, new int[] { 3, 6, 9, 13, 21, 24, 27, 29 }, new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}
/// <summary>
/// Runs the same analyzer instance over several inputs in a row, verifying
/// that its reusable token stream components reset correctly between calls.
/// </summary>
[Test]
public virtual void TestReusableTokenStream()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.Empty);
    // Empty input yields no tokens.
    AssertAnalyzesTo(analyzer, "", new string[] { });
    AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });
    // Current behavior splits "xy&z" and the e-mail address on punctuation
    // (compare TestReusableTokenStream30, where they stay whole).
    AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com", new string[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
}
/// <summary>
/// Same reuse scenario as <see cref="TestReusableTokenStream"/>, but under
/// the 3.0 compatibility version, where "xy&amp;z" and "xyz@demo.com"
/// remain single tokens.
/// </summary>
/// @deprecated (3.1) for version back compat
[Test]
[Obsolete("(3.1) for version back compat")]
public virtual void TestReusableTokenStream30()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
    // Empty input yields no tokens.
    AssertAnalyzesTo(analyzer, "", new string[] { });
    AssertAnalyzesTo(analyzer, "การที่ได้ต้องแสดงว่างานดี", new string[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });
    AssertAnalyzesTo(analyzer, "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com", new string[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
}
/// <summary>
/// LUCENENET specific: verifies that a single <see cref="ThaiAnalyzer"/>
/// instance can be used from several threads at once. All worker threads
/// wait on one latch and are released together to maximize contention.
/// </summary>
[Test]
[LuceneNetSpecific]
public virtual void TestConcurrency()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    // Fixed non-ASCII input built from raw char codes to pin the exact
    // characters (presumably chosen to reproduce a past failure - see the
    // test history for issue context).
    char[] chars = new char[] {
        (char)4160,
        (char)4124,
        (char)4097,
        (char)4177,
        (char)4113,
        (char)32,
        (char)10671,
    };
    string contents = new string(chars);
    // Sanity-check on the current thread before going concurrent.
    AssertAnalyzer(analyzer, contents);
    int numThreads = 4;
    // LUCENENET: CountdownEvent is IDisposable; dispose it once all
    // workers have finished (fixes a resource leak in the original).
    using (var startingGun = new CountdownEvent(1))
    {
        var threads = new ThaiAnalyzerThread[numThreads];
        for (int i = 0; i < threads.Length; i++)
        {
            threads[i] = new ThaiAnalyzerThread(startingGun, analyzer, contents);
        }
        foreach (var thread in threads)
        {
            thread.Start();
        }
        // Release every waiting worker at the same moment.
        startingGun.Signal();
        foreach (var t in threads)
        {
            try
            {
                t.Join();
            }
            catch (Exception e) when (e.IsInterruptedException())
            {
                fail("Thread interrupted");
            }
        }
    }
}
/// <summary>
/// Worker thread for <see cref="TestConcurrency"/>: blocks on a shared
/// latch, then repeatedly analyzes the same text with the shared analyzer.
/// </summary>
private class ThaiAnalyzerThread : ThreadJob
{
    private readonly CountdownEvent startSignal;
    private readonly Analyzer sharedAnalyzer;
    private readonly string input;

    public ThaiAnalyzerThread(CountdownEvent latch, Analyzer analyzer, string text)
    {
        startSignal = latch;
        sharedAnalyzer = analyzer;
        input = text;
    }

    public override void Run()
    {
        // Wait until the coordinating test releases all workers at once.
        startSignal.Wait();
        const int iterations = 1000;
        for (int iteration = 0; iteration < iterations; iteration++)
        {
            AssertAnalyzer(sharedAnalyzer, input);
        }
    }
}
/// <summary>
/// Analyzes <paramref name="text"/> twice with the same analyzer:
/// the first pass collects tokens, position increments and offsets by hand;
/// the second pass re-checks them via <c>AssertTokenStreamContents</c>,
/// proving the analyzer's reused components produce identical results.
/// </summary>
private static void AssertAnalyzer(Analyzer analyzer, string text)
{
    ICharTermAttribute termAtt;
    IOffsetAttribute offsetAtt;
    IPositionIncrementAttribute posIncAtt;
    JCG.List<string> tokens = new JCG.List<string>();
    JCG.List<int> positions = new JCG.List<int>();
    JCG.List<int> startOffsets = new JCG.List<int>();
    JCG.List<int> endOffsets = new JCG.List<int>();
    TokenStream ts;
    TextReader reader = new StringReader(text);

    // First pass: collect the analyzer's output manually.
    using (ts = analyzer.GetTokenStream("dummy", reader))
    {
        bool isReset = false;
        try
        {
            termAtt = ts.GetAttribute<ICharTermAttribute>();
            offsetAtt = ts.GetAttribute<IOffsetAttribute>();
            posIncAtt = ts.GetAttribute<IPositionIncrementAttribute>();
            ts.Reset();
            isReset = true;
            while (ts.IncrementToken())
            {
                Assert.IsNotNull(termAtt, "has no CharTermAttribute");
                tokens.Add(termAtt.ToString());
                positions.Add(posIncAtt.PositionIncrement);
                startOffsets.Add(offsetAtt.StartOffset);
                endOffsets.Add(offsetAtt.EndOffset);
            }
        }
        finally
        {
            ConsumeFullyIfNotReset(ts, isReset);
            ts.End();
        }
    } // implicit ts.Dispose()

    // Second pass: the reused analyzer must reproduce exactly the same
    // tokens, offsets and position increments.
    reader = new StringReader(text);
    using (ts = analyzer.GetTokenStream("dummy", reader))
    {
        bool isReset = false;
        try
        {
            AssertTokenStreamContents(ts,
                output: tokens.ToArray(),
                startOffsets: ToIntArray(startOffsets),
                endOffsets: ToIntArray(endOffsets),
                types: null,
                posIncrements: ToIntArray(positions),
                posLengths: null,
                finalOffset: text.Length,
                offsetsAreCorrect: true);
            isReset = true;
        }
        finally
        {
            ConsumeFullyIfNotReset(ts, isReset);
            ts.End();
        }
    } // implicit ts.Dispose()
}

/// <summary>
/// If the stream was never successfully reset (an exception fired before
/// consumption began), consume it fully so the subsequent End()/Dispose()
/// calls operate on a correctly-consumed stream; any secondary exception
/// is swallowed so the original one propagates unmasked.
/// </summary>
private static void ConsumeFullyIfNotReset(TokenStream ts, bool isReset)
{
    if (isReset)
    {
        return;
    }
    try
    {
        // consume correctly
        ts.Reset();
        while (ts.IncrementToken()) ;
    }
    catch (Exception)
    {
        // ignore - must not mask the exception already in flight
    }
}
/// <summary>
/// blast some random strings through the analyzer </summary>
[Test]
[AwaitsFix(BugUrl = "https://github.com/apache/lucenenet/issues/269")] // LUCENENET TODO: this test occasionally fails
public virtual void TestRandomStrings()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    int iterations = 1000 * RandomMultiplier;
    CheckRandomData(Random, analyzer, iterations);
}
/// <summary>
/// blast some random large strings through the analyzer </summary>
///
[Test]
[AwaitsFix(BugUrl = "https://github.com/apache/lucenenet/issues/269")] // LUCENENET TODO: this test occasionally fails
public virtual void TestRandomHugeStrings()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    int iterations = 100 * RandomMultiplier;
    // 8192 = maximum word length for the random data generator.
    CheckRandomData(Random, analyzer, iterations, 8192);
}
/// <summary>
/// LUCENE-3044: a token stream must still work when a later consumer adds
/// an attribute (<see cref="IFlagsAttribute"/>) that the analyzer itself
/// never uses.
/// </summary>
[Test]
public virtual void TestAttributeReuse()
{
#pragma warning disable 612, 618
    ThaiAnalyzer analyzer = new ThaiAnalyzer(LuceneVersion.LUCENE_30);
#pragma warning restore 612, 618
    const string input = "ภาษาไทย";
    string[] expected = { "ภาษา", "ไทย" };
    // just consume
    TokenStream ts = analyzer.GetTokenStream("dummy", input);
    AssertTokenStreamContents(ts, expected);
    // this consumer adds flagsAtt, which this analyzer does not use.
    ts = analyzer.GetTokenStream("dummy", input);
    ts.AddAttribute<IFlagsAttribute>();
    AssertTokenStreamContents(ts, expected);
}
/// <summary>
/// Mixed English + Thai input with an empty stopword set: the English
/// words come out lowercased, the Thai run is segmented into words, and
/// the offsets continue correctly across the language boundary.
/// </summary>
[Test]
public virtual void TestTwoSentences()
{
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.Empty);
    string input = "This is a test. การที่ได้ต้องแสดงว่างานดี";
    string[] expectedTokens = { "this", "is", "a", "test", "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" };
    int[] expectedStartOffsets = { 0, 5, 8, 10, 16, 19, 22, 25, 29, 33, 36, 39 };
    int[] expectedEndOffsets = { 4, 7, 9, 14, 19, 22, 25, 29, 33, 36, 39, 41 };
    AssertAnalyzesTo(analyzer, input, expectedTokens, expectedStartOffsets, expectedEndOffsets);
}
/// <summary>
/// LUCENENET: Tests scenario outlined in <see cref="ThaiWordBreaker"/> —
/// a run of Thai digits followed by ASCII digits stays one token.
/// </summary>
[Test]
[LuceneNetSpecific]
public virtual void TestNumeralBreaking()
{
    var analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.Empty);
    string[] expected = { "๑๒๓456" };
    AssertAnalyzesTo(analyzer, "๑๒๓456", expected);
}
}
}
#endif