blob: 841e38879654b7a69f3f9e7c945443603ad78ef6 [file] [log] [blame]
using J2N;
using J2N.Text;
using System;
using System.IO;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using NUnit.Framework;
namespace Lucene.Net.Analysis.Util
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Test cases for <see cref="CharTokenizer"/> subclasses.
/// </summary>
[TestFixture]
public class TestCharTokenizers : BaseTokenStreamTestCase
{
/*
 * Verifies that surrogate pairs are read without losing the pairing
 * when a pair sits at the border of the internal IO buffer.
 */
[Test]
public virtual void TestReadSupplementaryChars()
{
    // random amount of input, scaled by the test multiplier
    int repetitions = (1024 + Random.Next(1024)) * RANDOM_MULTIPLIER;

    var text = new StringBuilder();
    int i = 1;
    while (i < repetitions)
    {
        text.Append("\ud801\udc1cabc");
        // separate the input into words after every tenth repetition
        if (i % 10 == 0)
        {
            text.Append(" ");
        }
        i++;
    }

    // internal buffer size is 1024 make sure we have a surrogate pair right at the border
    text.Insert(1023, "\ud801\udc1c");

    string input = text.ToString();
    var tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    // expected tokens: the lower-cased input split on spaces, trailing empty entries removed
    var expectedTokens = input.ToLowerInvariant().Split(' ').TrimEnd();
    AssertTokenStreamContents(tokenizer, expectedTokens);
}
/*
 * Exercises the internal growth of the TermAttribute buffer. If the internal
 * algorithm that extends the char array only grows it by 1 char while the
 * next code point to be stored is supplementary (occupying 2 chars), an
 * index out of bounds exception is triggered.
 */
[Test]
public virtual void TestExtendCharBuffer()
{
    // a prefix of 1..40 'a' characters precedes the supplementary code point
    for (int prefixLength = 1; prefixLength <= 40; prefixLength++)
    {
        string input = new string('a', prefixLength) + "\ud801\udc1cabc";
        var tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
        // the whole input is expected back as a single lower-cased token
        AssertTokenStreamContents(tokenizer, new[] { input.ToLowerInvariant() });
    }
}
/*
 * Tests the max word length of 255: the tokenizer splits at the 255th char
 * no matter what follows.
 */
[Test]
public virtual void TestMaxWordLength()
{
    // one word of exactly 255 characters, fed twice back-to-back without a separator
    string word = new string('A', 255);
    var tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(word + word));

    // expect two tokens, each the lower-cased 255-character word
    string[] expected = { word.ToLowerInvariant(), word.ToLowerInvariant() };
    AssertTokenStreamContents(tokenizer, expected);
}
/*
 * Tests the max word length of 255 with a surrogate pair at position 255.
 */
[Test]
public virtual void TestMaxWordLengthWithSupplementary()
{
    // 254 BMP characters followed by one supplementary code point (two chars)
    string word = new string('A', 254) + "\ud801\udc1c";
    var tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(word + word));

    // expect two tokens, each the complete lower-cased word including its surrogate pair
    string[] expected = { word.ToLowerInvariant(), word.ToLowerInvariant() };
    AssertTokenStreamContents(tokenizer, expected);
}
// LUCENE-3642: normalize SMP->BMP and check that offsets are correct.
// For random unicode input, each token's offsets are mapped back onto the original
// string and every code point of that slice must be a letter.
[Test]
public virtual void TestCrossPlaneNormalization()
{
    var analyzer = new AnalyzerAnonymousInnerClassHelper();
    var num = 1000 * RANDOM_MULTIPLIER;
    for (var i = 0; i < num; i++)
    {
        var s = TestUtil.RandomUnicodeString(Random);
        var ts = analyzer.GetTokenStream("foo", s);
        try
        {
            ts.Reset();
            var offsetAtt = ts.AddAttribute<IOffsetAttribute>();
            while (ts.IncrementToken())
            {
                // slice of the ORIGINAL input covered by the token's offsets
                var highlightedText = s.Substring(offsetAtt.StartOffset, offsetAtt.EndOffset - offsetAtt.StartOffset);
                for (int j = 0, cp = 0; j < highlightedText.Length; j += Character.CharCount(cp))
                {
                    // Character.CodePointAt follows Java's codePointAt contract (an unpaired
                    // surrogate is returned as-is), so offsets that split a surrogate pair
                    // surface as the "non-letter" assertion below. char.ConvertToUtf32 would
                    // throw ArgumentException in that case and hide the diagnostic.
                    cp = Character.CodePointAt(highlightedText, j);
                    assertTrue("non-letter:" + cp.ToString("x"), Character.IsLetter(cp));
                }
            }
            ts.End();
        }
        finally
        {
            IOUtils.DisposeWhileHandlingException(ts);
        }
    }
    // just for fun
    CheckRandomData(Random, analyzer, num);
}
/// <summary>
/// Analyzer whose tokenizer is a <see cref="LetterTokenizer"/> that normalizes every
/// code point above U+FFFF to the BMP letter 'δ' and leaves all other code points unchanged.
/// </summary>
private sealed class AnalyzerAnonymousInnerClassHelper : Analyzer
{
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        Tokenizer tokenizer = new LetterTokenizerAnonymousInnerClassHelper(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
    }

    private sealed class LetterTokenizerAnonymousInnerClassHelper : LetterTokenizer
    {
        // The parameter is named matchVersion (not TEST_VERSION_CURRENT) so it no longer
        // shadows the static test-version field and follows camelCase parameter naming.
        public LetterTokenizerAnonymousInnerClassHelper(LuceneVersion matchVersion, TextReader reader)
            : base(matchVersion, reader)
        {
        }

        /// <summary>
        /// Maps supplementary code points (greater than 0xffff) to 'δ'; returns
        /// every other code point as-is.
        /// </summary>
        protected override int Normalize(int c)
        {
            return c > 0xffff ? 'δ' : c;
        }
    }
}
// LUCENE-3642: normalize BMP->SMP and check that offsets are correct.
// For random unicode input, each token's offsets are mapped back onto the original
// string and every code point of that slice must be a letter.
[Test]
public virtual void TestCrossPlaneNormalization2()
{
    var analyzer = new AnalyzerAnonymousInnerClassHelper2();
    var num = 1000 * RANDOM_MULTIPLIER;
    for (var i = 0; i < num; i++)
    {
        var s = TestUtil.RandomUnicodeString(Random);
        var ts = analyzer.GetTokenStream("foo", s);
        try
        {
            ts.Reset();
            var offsetAtt = ts.AddAttribute<IOffsetAttribute>();
            while (ts.IncrementToken())
            {
                // slice of the ORIGINAL input covered by the token's offsets
                string highlightedText = s.Substring(offsetAtt.StartOffset, offsetAtt.EndOffset - offsetAtt.StartOffset);
                for (int j = 0, cp = 0; j < highlightedText.Length; j += Character.CharCount(cp))
                {
                    // Character.CodePointAt follows Java's codePointAt contract (an unpaired
                    // surrogate is returned as-is), so offsets that split a surrogate pair
                    // surface as the "non-letter" assertion below. char.ConvertToUtf32 would
                    // throw ArgumentException in that case and hide the diagnostic.
                    cp = Character.CodePointAt(highlightedText, j);
                    assertTrue("non-letter:" + cp.ToString("x"), Character.IsLetter(cp));
                }
            }
            ts.End();
        }
        finally
        {
            IOUtils.DisposeWhileHandlingException(ts);
        }
    }
    // just for fun
    CheckRandomData(Random, analyzer, num);
}
/// <summary>
/// Analyzer whose tokenizer is a <see cref="LetterTokenizer"/> that normalizes every
/// code point up to U+FFFF to the supplementary code point 0x1043C and leaves code
/// points above U+FFFF unchanged.
/// </summary>
private sealed class AnalyzerAnonymousInnerClassHelper2 : Analyzer
{
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        Tokenizer tokenizer = new LetterTokenizerAnonymousInnerClassHelper2(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
    }

    private sealed class LetterTokenizerAnonymousInnerClassHelper2 : LetterTokenizer
    {
        // The parameter is named matchVersion (not TEST_VERSION_CURRENT) so it no longer
        // shadows the static test-version field and follows camelCase parameter naming.
        public LetterTokenizerAnonymousInnerClassHelper2(LuceneVersion matchVersion, TextReader reader)
            : base(matchVersion, reader)
        {
        }

        /// <summary>
        /// Maps code points less than or equal to 0xffff to 0x1043C; returns
        /// every other code point as-is.
        /// </summary>
        protected override int Normalize(int c)
        {
            return c <= 0xffff ? 0x1043C : c;
        }
    }
}
/// <summary>
/// LUCENENET: Added this test as proof that making the IsTokenChar parameter a char
/// is not going to work 100% of the time because of surrogate pairs.
/// <para/>
/// Runs inputs that contain surrogate pairs through an analyzer whose tokenizer accepts
/// numbers and surrogate pairs, and expects the pairs to come back intact inside the tokens.
/// </summary>
[Test]
public virtual void TestSurrogates()
{
    // the two surrogate pairs used below, spelled as UTF-16 code units
    string pairA = "\uD86C\uDC01"; // (char)55404, (char)56321
    string pairB = "\uD801\uDC44"; // (char)55297, (char)56388

    var analyzer = new AnalyzerAnonymousInnerClassHelper3();

    string firstInput = "bar 123" + pairA + "34 5te 987";
    string[] firstExpected = { "123𫀁34", "5", "987" };
    AssertAnalyzesTo(analyzer, firstInput, firstExpected);

    string secondInput = "787 " + pairB + "6" + pairA + " art true 734";
    string[] secondExpected = { "787", "𐑄6𫀁", "734" };
    AssertAnalyzesTo(analyzer, secondInput, secondExpected);
}
/// <summary>
/// Analyzer backed by a <see cref="CharTokenizer"/> whose <c>IsTokenChar</c> accepts
/// numeric BMP characters and supplementary code points (the ones UTF-16 stores as a
/// surrogate pair) and rejects everything else.
/// </summary>
private sealed class AnalyzerAnonymousInnerClassHelper3 : Analyzer
{
    public AnalyzerAnonymousInnerClassHelper3()
    { }

    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        Tokenizer tokenizer = new NumberAndSurrogatePairTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, tokenizer);
    }

    private sealed class NumberAndSurrogatePairTokenizer : CharTokenizer
    {
        // Range of code points that UTF-16 encodes as a surrogate pair.
        private const int MinSupplementaryCodePoint = 0x10000;
        private const int MaxCodePoint = 0x10FFFF;

        public NumberAndSurrogatePairTokenizer(LuceneVersion matchVersion, TextReader reader)
            : base(matchVersion, reader)
        {
        }

        /// <summary>
        /// Returns <c>true</c> when <paramref name="c"/> is a numeric BMP character or a
        /// supplementary code point; <c>false</c> otherwise.
        /// </summary>
        /// <param name="c">The code point to classify.</param>
        protected override bool IsTokenChar(int c)
        {
            // Supplementary code points are exactly those represented by a surrogate pair.
            // Checking the range directly avoids allocating a string per character and avoids
            // char.ConvertFromUtf32, which throws ArgumentOutOfRangeException when handed a
            // lone surrogate code point (0xD800-0xDFFF).
            if (c >= MinSupplementaryCodePoint)
            {
                return c <= MaxCodePoint;
            }

            // Only BMP values fit in a char, so the cast is lossless here.
            return c >= 0 && char.IsNumber((char)c);
        }
    }
}
}
}