blob: 95a4e0ad078b1fe1323cb261c2d57aefef12edda [file] [log] [blame]
using ICU4N.Text;
using Lucene.Net.Attributes;
using Lucene.Net.Support;
using NUnit.Framework;
using System;
using Assert = Lucene.Net.TestFramework.Assert;
namespace Lucene.Net.Tests.ICU.Support
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Tests to ensure compatibility with JDK 7
/// </summary>
[TestFixture]
//[Ignore("Ignore tests until a JavaBreakIterator using RuleBasedBreakIterator is written. The rules for JavaBreakIterator can be found at:\n"
// + "http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/7u40-b43/sun/text/resources/BreakIteratorRules.java/ \n"
// + "and http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/7u40-b43/sun/text/resources/BreakIteratorRules_th.java#BreakIteratorRules_th")]
public class TestJdkBreakIterator
{
const String TEXT =
"Apache Lucene(TM) is a high-performance, full-featured text search engine library written entirely in Java.";
private BreakIterator GetWordInstance(System.Globalization.CultureInfo locale)
{
return JdkBreakIterator.GetWordInstance(locale);
}
[Test]
[LuceneNetSpecific]
public void TestWordIteration()
{
BreakIterator bi = GetWordInstance(System.Globalization.CultureInfo.InvariantCulture);
bi.SetText("");
// Test empty
Assert.AreEqual(0, bi.Current);
Assert.AreEqual(BreakIterator.Done, bi.Next());
Assert.AreEqual(0, bi.Current);
bi.SetText(TEXT);
// Ensure position starts at 0 when initialized
Assert.AreEqual(0, bi.Current);
// Check first boundary (Apache^)
Assert.AreEqual(6, bi.Next());
// Ensure Current returns the last boundary iterated to
Assert.AreEqual(6, bi.Current);
// Check second boundary (^Lucene)
Assert.AreEqual(7, bi.Next());
// Ensure Current returns the last boundary iterated to
Assert.AreEqual(7, bi.Current);
// Check third boundary (Lucene^)
Assert.AreEqual(13, bi.Next());
// Ensure Current returns the last boundary iterated to
Assert.AreEqual(13, bi.Current);
// Check fourth boundary (^TM)
Assert.AreEqual(14, bi.Next());
// Check fifth boundary (TM^)
Assert.AreEqual(16, bi.Next());
// Check sixth boundary (TM)^
Assert.AreEqual(17, bi.Next());
// Check seventh boundary (^is)
Assert.AreEqual(18, bi.Next());
// Move to (^high-performance)
bi.Next();
bi.Next();
bi.Next();
// Check next boundary (^high-performance)
Assert.AreEqual(23, bi.Next());
// Ensure we don't break on hyphen (high-performance^)
Assert.AreEqual(39, bi.Next());
// Check MoveLast()
Assert.AreEqual(107, bi.Last());
// Check going past last boundary
Assert.AreEqual(BreakIterator.Done, bi.Next());
// Check we are still at last boundary
Assert.AreEqual(107, bi.Current);
// Check MoveFirst()
Assert.AreEqual(0, bi.First());
// Check going past first boundary
Assert.AreEqual(BreakIterator.Done, bi.Previous());
// Check we are still at first boundary
Assert.AreEqual(0, bi.Current);
}
[Ignore("We did not make a dictionary based break iterator with custom Thai rules, so this has the default behavior of ICU rather than the JDK")]
[Test]
[LuceneNetSpecific]
public void TestWordIterationThai()
{
BreakIterator bi = GetWordInstance(new System.Globalization.CultureInfo("th"));
bi.SetText("");
// Test empty
Assert.AreEqual(0, bi.Current);
Assert.AreEqual(BreakIterator.Done, bi.Next());
Assert.AreEqual(0, bi.Current);
bi.SetText("บริษัทMicrosoftบริการดีที่สุด");
// Ensure position starts at 0 when initialized
Assert.AreEqual(0, bi.Current);
// Check first boundary (บริษัท^Microsoft)
Assert.AreEqual(6, bi.Next());
// Ensure Current returns the last boundary iterated to
Assert.AreEqual(6, bi.Current);
// Check second boundary (Microsoft^บริการ)
Assert.AreEqual(15, bi.Next());
// Ensure Current returns the last boundary iterated to
Assert.AreEqual(15, bi.Current);
// Check third boundary (บริการ^ดี)
Assert.AreEqual(21, bi.Next());
// Ensure Current returns the last boundary iterated to
Assert.AreEqual(21, bi.Current);
// Check fourth boundary (ดี^ที่สุด)
Assert.AreEqual(23, bi.Next());
// Check fifth boundary (ดีที่สุด^)
Assert.AreEqual(29, bi.Next());
// Check beyond last boundary (ดีที่สุด)^
Assert.AreEqual(BreakIterator.Done, bi.Next());
// Check we are still at last boundary
Assert.AreEqual(29, bi.Current);
// Check MovePrevious() (ดี^ที่สุด)
Assert.AreEqual(23, bi.Previous());
// Check MoveFirst()
Assert.AreEqual(0, bi.First());
// Check going past first boundary
Assert.AreEqual(BreakIterator.Done, bi.Previous());
// Check we are still at first boundary
Assert.AreEqual(0, bi.Current);
// Check Numerals
bi.SetText("๑23๔๕๖7");
// Ensure position starts at 0 when initialized
Assert.AreEqual(0, bi.Current);
// Ensure Hindu and Thai numerals stay in one group
Assert.AreEqual(7, bi.Next());
}
const String SENTENCE_TEXT =
"Apache Lucene(TM) is a high-performance, full-featured text\nsearch engine library written entirely in Java. " +
"It is a technology suitable for nearly any application that requires" +
"full-text search, especially cross-platform. Apache Lucene is an open source project available for free download.\n" +
"Lucene makes finding things easy. Lucene is powerful. Lucene is exciting. Lucene is cool. Where be Lucene now?";
private BreakIterator GetSentenceInstance(System.Globalization.CultureInfo locale)
{
return JdkBreakIterator.GetSentenceInstance(locale);
}
[Test]
[LuceneNetSpecific]
public void TestSentenceIteration()
{
BreakIterator bi = GetSentenceInstance(System.Globalization.CultureInfo.InvariantCulture);
bi.SetText("");
// Test empty
Assert.AreEqual(0, bi.Current);
Assert.AreEqual(BreakIterator.Done, bi.Next());
Assert.AreEqual(0, bi.Current);
bi.SetText(SENTENCE_TEXT);
// Ensure position starts at 0 when initialized
Assert.AreEqual(0, bi.Current);
// Check first boundary (in Java.^) - Ensure we don't break on \n
Assert.AreEqual(108, bi.Next());
// Ensure Current returns the most recent boundary
Assert.AreEqual(108, bi.Current);
// Check next boundary (especially cross-platform.^)
Assert.AreEqual(221, bi.Next());
// Check next boundary (free download.^)
Assert.AreEqual(290, bi.Next());
// Check next boundary (things easy.^)
Assert.AreEqual(324, bi.Next());
// Check next boundary (is powerful.^)
Assert.AreEqual(344, bi.Next());
// Check next boundary (is exciting.^)
Assert.AreEqual(364, bi.Next());
// Check next boundary (is cool.^)
Assert.AreEqual(380, bi.Next());
// Check last boundary (Lucene now?^)
Assert.AreEqual(400, bi.Next());
// Check move past last boundary
Assert.AreEqual(BreakIterator.Done, bi.Next());
// Ensure we are still at last boundary
Assert.AreEqual(400, bi.Current);
// Check MovePrevious
Assert.AreEqual(380, bi.Previous());
// Ensure we get the same value for Current as the last move
Assert.AreEqual(380, bi.Current);
// Check MoveFirst
Assert.AreEqual(0, bi.First());
// Ensure we get the same value for Current as the last move
Assert.AreEqual(0, bi.Current);
// Check moving beyond first boundary
Assert.AreEqual(BreakIterator.Done, bi.Previous());
// Ensure we are still at first boundary
Assert.AreEqual(0, bi.Current);
// Check MoveLast()
Assert.AreEqual(400, bi.Last());
}
// NOTE: This test doesn't pass. We need to customize line iteration in order to get it to. However,
// none of the defaults set in lucene use line iteration, so this is low priority. Leaving in place
// in case we need to make JDK style line breaks in the future.
const String LINE_TEXT =
"Apache\tLucene(TM) is a high-\nperformance, full-featured text search engine library written entirely in Java.";
private BreakIterator GetLineInstance(System.Globalization.CultureInfo locale)
{
return BreakIterator.GetLineInstance(locale);
}
[Test]
[Ignore("Not required to confirm compatibility with Java, as this is not required by Lucene's tests.")]
public void TestLineIteration()
{
BreakIterator bi = GetLineInstance(System.Globalization.CultureInfo.InvariantCulture);
// Test empty
Assert.AreEqual(0, bi.Current);
Assert.AreEqual(BreakIterator.Done, bi.Next());
Assert.AreEqual(0, bi.Current);
bi.SetText(LINE_TEXT);
// Ensure position starts at 0 when initialized
Assert.AreEqual(0, bi.Current);
// Check first boundary (Apache\t^Lucene) - Ensure we break on \t
Assert.AreEqual(7, bi.Next());
// Ensure Current returns the most recent boundary
Assert.AreEqual(7, bi.Current);
// Check next boundary (Lucene^(TM))
Assert.AreEqual(13, bi.Next());
// Ensure Current returns the most recent boundary
Assert.AreEqual(13, bi.Current);
// Check next boundary (Lucene(TM) ^is a)
Assert.AreEqual(18, bi.Next());
// Ensure Current returns the most recent boundary
Assert.AreEqual(18, bi.Current);
// Move to start of high-performance
bi.Next();
bi.Next();
// Check next boundary (high-\n^performance)
Assert.AreEqual(29, bi.Next());
// Check last boundary (in Java.^)
Assert.AreEqual(108, bi.Last());
// Check move past last boundary
Assert.AreEqual(BreakIterator.Done, bi.Next());
// Ensure we are still at last boundary
Assert.AreEqual(108, bi.Current);
// Check MovePrevious
Assert.AreEqual(103, bi.Previous());
// Ensure we get the same value for Current as the last move
Assert.AreEqual(103, bi.Current);
// Check MoveFirst
Assert.AreEqual(0, bi.First());
// Ensure we get the same value for Current as the last move
Assert.AreEqual(0, bi.Current);
// Check moving beyond first boundary
Assert.AreEqual(BreakIterator.Done, bi.Previous());
// Ensure we are still at first boundary
Assert.AreEqual(0, bi.Current);
// Check MoveLast()
Assert.AreEqual(108, bi.Last());
}
}
}