blob: bc8afd7e8d0d25c8129093c6cb7cb4e19cb975f6 [file] [log] [blame]
using J2N;
using J2N.Text;
using Lucene.Net.Attributes;
using NUnit.Framework;
using System;
using Assert = Lucene.Net.TestFramework.Assert;
namespace Lucene.Net.Util
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Some of this code came from the excellent Unicode
* conversion examples from:
*
* http://www.unicode.org/Public/PROGRAMS/CVTUTF
*
* Full Copyright for that code follows:
*/
/*
* Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
* this source code is provided as is by Unicode, Inc. No claims are
* made as to fitness for any particular purpose. No warranties of any
* kind are expressed or implied. The recipient agrees to determine
* applicability of information provided. If this file has been
* purchased on magnetic or optical media from Unicode, Inc., the
* sole remedy for any claim will be exchange of defective media
* within 90 days of receipt.
*
* Limitations on Rights to Redistribute this Code
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*/
/*
* Additional code came from the IBM ICU library.
*
* http://www.icu-project.org
*
* Full Copyright for that code follows.
*/
/*
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* provided that the above copyright notice(s) and this permission notice appear
* in all copies of the Software and that both the above copyright notice(s) and
* this permission notice appear in supporting documentation.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
* ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Except as contained in this notice, the name of a copyright holder shall not
* be used in advertising or otherwise to promote the sale, use or other
* dealings in this Software without prior written authorization of the
* copyright holder.
*/
[TestFixture]
public class TestUnicodeUtil : LuceneTestCase
{
/// <summary>
/// Exercises <c>UnicodeUtil.CodePointCount</c> with malformed UTF-8 (which must be
/// rejected), a few hand-written well-formed sequences, and random strings whose
/// code point count is cross-checked against the UTF-16 source string.
/// </summary>
[Test]
public virtual void TestCodePointCount()
{
    byte[][] malformedSequences =
    {
        // Check invalid codepoints.
        AsByteArray('z', 0x80, 'z', 'z', 'z'),
        AsByteArray('z', 0xc0 - 1, 'z', 'z', 'z'),
        // Check 5-byte and longer sequences.
        AsByteArray('z', 0xf8, 'z', 'z', 'z'),
        AsByteArray('z', 0xfc, 'z', 'z', 'z'),
        // Check improperly terminated codepoints.
        AsByteArray('z', 0xc2),
        AsByteArray('z', 0xe2),
        AsByteArray('z', 0xe2, 0x82),
        AsByteArray('z', 0xf0),
        AsByteArray('z', 0xf0, 0xa4),
        AsByteArray('z', 0xf0, 0xa4, 0xad),
    };
    foreach (byte[] sequence in malformedSequences)
    {
        AssertcodePointCountThrowsAssertionOn(sequence);
    }

    // Check some typical examples (multibyte).
    BytesRef noBytes = new BytesRef(AsByteArray());
    Assert.AreEqual(0, UnicodeUtil.CodePointCount(noBytes));
    BytesRef asciiOnly = new BytesRef(AsByteArray('z', 'z', 'z'));
    Assert.AreEqual(3, UnicodeUtil.CodePointCount(asciiOnly));
    BytesRef withTwoByte = new BytesRef(AsByteArray('z', 0xc2, 0xa2));
    Assert.AreEqual(2, UnicodeUtil.CodePointCount(withTwoByte));
    BytesRef withThreeByte = new BytesRef(AsByteArray('z', 0xe2, 0x82, 0xac));
    Assert.AreEqual(2, UnicodeUtil.CodePointCount(withThreeByte));
    BytesRef withFourByte = new BytesRef(AsByteArray('z', 0xf0, 0xa4, 0xad, 0xa2));
    Assert.AreEqual(2, UnicodeUtil.CodePointCount(withFourByte));

    // And do some random stuff.
    BytesRef encoded = new BytesRef(20);
    for (int remaining = AtLeast(50000); remaining > 0; remaining--)
    {
        string text = TestUtil.RandomUnicodeString(Random);
        UnicodeUtil.UTF16toUTF8(text, 0, text.Length, encoded);
        int countFromString = text.CodePointCount(0, text.Length);
        int countFromUtf8 = UnicodeUtil.CodePointCount(encoded);
        assertEquals(countFromString, countFromUtf8);
    }
}
/// <summary>
/// Builds a byte array with one element per argument, each produced by casting the
/// corresponding <c>int</c> to <c>byte</c>. Lets callers mix char literals and hex
/// values in a single argument list.
/// </summary>
/// <param name="ints">The values to convert, in order.</param>
/// <returns>A new array of the same length as <paramref name="ints"/>.</returns>
private static byte[] AsByteArray(params int[] ints)
{
    byte[] converted = new byte[ints.Length];
    int position = 0;
    foreach (int value in ints)
    {
        converted[position] = (byte)value;
        position++;
    }
    return converted;
}
/// <summary>
/// Asserts that <c>UnicodeUtil.CodePointCount</c> rejects the given bytes with an
/// exception that <c>IsIllegalArgumentException()</c> recognizes. Any other exception
/// type is not caught here and propagates to the caller.
/// </summary>
/// <param name="bytes">The raw bytes to wrap in a <c>BytesRef</c> and count.</param>
private static void AssertcodePointCountThrowsAssertionOn(params byte[] bytes)
{
    bool rejected;
    try
    {
        UnicodeUtil.CodePointCount(new BytesRef(bytes));
        // Reaching this line means the input was accepted.
        rejected = false;
    }
    catch (Exception ex) when (ex.IsIllegalArgumentException())
    {
        rejected = true;
    }
    Assert.IsTrue(rejected);
}
/// <summary>
/// Encodes random strings to UTF-8 with the <c>string</c> overload of
/// <c>UnicodeUtil.UTF16toUTF8</c>, decodes them with <c>UnicodeUtil.UTF8toUTF32</c>,
/// and checks both the number and the values of the decoded code points against the
/// code points read directly from the string.
/// </summary>
[Test]
public virtual void TestUTF8toUTF32()
{
    BytesRef utf8 = new BytesRef(20);
    Int32sRef utf32 = new Int32sRef(20);
    int[] codePoints = new int[20];
    int num = AtLeast(50000);
    for (int i = 0; i < num; i++)
    {
        string s = TestUtil.RandomUnicodeString(Random);
        UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8);
        UnicodeUtil.UTF8toUTF32(utf8, utf32);

        // Build the expected code points straight from the UTF-16 string.
        int charUpto = 0;
        int intUpto = 0;
        while (charUpto < s.Length)
        {
            int cp = Character.CodePointAt(s, charUpto);
            codePoints[intUpto++] = cp;
            charUpto += Character.CharCount(cp);
        }

        // Check the decoded length as well: comparing only the first intUpto entries
        // would let extra trailing code points go unnoticed.
        bool sameLength = utf32.Length == intUpto;
        if (!sameLength || !ArrayUtil.Equals(codePoints, 0, utf32.Int32s, utf32.Offset, intUpto))
        {
            Console.WriteLine("FAILED");
            for (int j = 0; j < s.Length; j++)
            {
                Console.WriteLine(" char[" + j + "]=" + ((int)s[j]).ToString("x"));
            }
            Console.WriteLine();
            // Dump before asserting so the details are printed even when the lengths
            // differ, and honor utf32.Offset just like the comparison above does.
            int shown = Math.Min(intUpto, utf32.Length);
            for (int j = 0; j < shown; j++)
            {
                Console.WriteLine(" " + utf32.Int32s[utf32.Offset + j].ToString("x") + " vs " + codePoints[j].ToString("x"));
            }
            Assert.AreEqual(intUpto, utf32.Length);
            Assert.Fail("mismatch");
        }
    }
}
/// <summary>
/// Same round trip as the <c>string</c>-based UTF-8 to UTF-32 test, but the UTF-8 is
/// produced from <c>s.AsCharSequence()</c>. Checks both the number and the values of
/// the code points decoded by <c>UnicodeUtil.UTF8toUTF32</c>.
/// </summary>
[Test, LuceneNetSpecific]
public virtual void TestUTF8toUTF32_ICharSequence()
{
    BytesRef utf8 = new BytesRef(20);
    Int32sRef utf32 = new Int32sRef(20);
    int[] codePoints = new int[20];
    int num = AtLeast(50000);
    for (int i = 0; i < num; i++)
    {
        string s = TestUtil.RandomUnicodeString(Random);
        UnicodeUtil.UTF16toUTF8(s.AsCharSequence(), 0, s.Length, utf8);
        UnicodeUtil.UTF8toUTF32(utf8, utf32);

        // Build the expected code points straight from the UTF-16 string.
        int charUpto = 0;
        int intUpto = 0;
        while (charUpto < s.Length)
        {
            int cp = Character.CodePointAt(s, charUpto);
            codePoints[intUpto++] = cp;
            charUpto += Character.CharCount(cp);
        }

        // Check the decoded length as well: comparing only the first intUpto entries
        // would let extra trailing code points go unnoticed.
        bool sameLength = utf32.Length == intUpto;
        if (!sameLength || !ArrayUtil.Equals(codePoints, 0, utf32.Int32s, utf32.Offset, intUpto))
        {
            Console.WriteLine("FAILED");
            for (int j = 0; j < s.Length; j++)
            {
                Console.WriteLine(" char[" + j + "]=" + ((int)s[j]).ToString("x"));
            }
            Console.WriteLine();
            // Dump before asserting so the details are printed even when the lengths
            // differ, and honor utf32.Offset just like the comparison above does.
            int shown = Math.Min(intUpto, utf32.Length);
            for (int j = 0; j < shown; j++)
            {
                Console.WriteLine(" " + utf32.Int32s[utf32.Offset + j].ToString("x") + " vs " + codePoints[j].ToString("x"));
            }
            Assert.AreEqual(intUpto, utf32.Length);
            Assert.Fail("mismatch");
        }
    }
}
/// <summary>
/// Same round trip as the <c>string</c>-based UTF-8 to UTF-32 test, but the UTF-8 is
/// produced from <c>s.ToCharArray()</c>. Checks both the number and the values of
/// the code points decoded by <c>UnicodeUtil.UTF8toUTF32</c>.
/// </summary>
[Test, LuceneNetSpecific]
public virtual void TestUTF8toUTF32_CharArray()
{
    BytesRef utf8 = new BytesRef(20);
    Int32sRef utf32 = new Int32sRef(20);
    int[] codePoints = new int[20];
    int num = AtLeast(50000);
    for (int i = 0; i < num; i++)
    {
        string s = TestUtil.RandomUnicodeString(Random);
        UnicodeUtil.UTF16toUTF8(s.ToCharArray(), 0, s.Length, utf8);
        UnicodeUtil.UTF8toUTF32(utf8, utf32);

        // Build the expected code points straight from the UTF-16 string.
        int charUpto = 0;
        int intUpto = 0;
        while (charUpto < s.Length)
        {
            int cp = Character.CodePointAt(s, charUpto);
            codePoints[intUpto++] = cp;
            charUpto += Character.CharCount(cp);
        }

        // Check the decoded length as well: comparing only the first intUpto entries
        // would let extra trailing code points go unnoticed.
        bool sameLength = utf32.Length == intUpto;
        if (!sameLength || !ArrayUtil.Equals(codePoints, 0, utf32.Int32s, utf32.Offset, intUpto))
        {
            Console.WriteLine("FAILED");
            for (int j = 0; j < s.Length; j++)
            {
                Console.WriteLine(" char[" + j + "]=" + ((int)s[j]).ToString("x"));
            }
            Console.WriteLine();
            // Dump before asserting so the details are printed even when the lengths
            // differ, and honor utf32.Offset just like the comparison above does.
            int shown = Math.Min(intUpto, utf32.Length);
            for (int j = 0; j < shown; j++)
            {
                Console.WriteLine(" " + utf32.Int32s[utf32.Offset + j].ToString("x") + " vs " + codePoints[j].ToString("x"));
            }
            Assert.AreEqual(intUpto, utf32.Length);
            Assert.Fail("mismatch");
        }
    }
}
/// <summary>
/// Table-driven check of <c>UnicodeUtil.NewString</c>. Each row holds
/// { start, count, expected substring start, expected substring length }, where the
/// first two values are passed to <c>NewString</c> and the last two select the expected
/// text from <c>cpString</c>. An expected length of -1 marks a row that must throw.
/// </summary>
[Test]
public virtual void TestNewString()
{
    int[] codePoints = new int[]
    {
        Character.ToCodePoint(Character.MinHighSurrogate, Character.MaxLowSurrogate),
        Character.ToCodePoint(Character.MaxHighSurrogate, Character.MinLowSurrogate),
        Character.MaxHighSurrogate,
        'A',
        -1
    };
    string cpString = "" + Character.MinHighSurrogate + Character.MaxLowSurrogate + Character.MaxHighSurrogate + Character.MinLowSurrogate + Character.MaxHighSurrogate + 'A';
    int[][] tests = new int[][]
    {
        new int[] { 0, 1, 0, 2 },
        new int[] { 0, 2, 0, 4 },
        new int[] { 1, 1, 2, 2 },
        new int[] { 1, 2, 2, 3 },
        new int[] { 1, 3, 2, 4 },
        new int[] { 2, 2, 4, 2 },
        new int[] { 2, 3, 0, -1 },
        new int[] { 4, 5, 0, -1 },
        new int[] { 3, -1, 0, -1 }
    };

    foreach (int[] row in tests)
    {
        int start = row[0];
        int count = row[1];
        int expectedStart = row[2];
        int expectedLength = row[3];

        bool threw = false;
        try
        {
            string built = UnicodeUtil.NewString(codePoints, start, count);
            // A successful call is only acceptable for rows that expect a result.
            Assert.IsFalse(expectedLength == -1);
            Assert.AreEqual(cpString.Substring(expectedStart, expectedLength), built);
        }
        catch (Exception ex) when (ex.IsIndexOutOfBoundsException() || ex.IsIllegalArgumentException())
        {
            // Ignored here; checked against the expectation below.
            threw = true;
        }

        if (threw)
        {
            Assert.IsTrue(expectedLength == -1);
        }
    }
}
/// <summary>
/// Decodes the UTF-8 form of random strings into a <c>CharsRef</c> that starts out with
/// a randomly sized backing array, offset and length, and checks that the decoded text
/// equals the source string.
/// </summary>
[Test]
public virtual void TestUTF8UTF16CharsRef()
{
    int num = AtLeast(3989);
    for (int i = 0; i < num; i++)
    {
        string unicode = TestUtil.RandomRealisticUnicodeString(Random);
        BytesRef @ref = new BytesRef(unicode);
        char[] arr = new char[1 + Random.Next(100)];
        int offset = Random.Next(arr.Length);
        int len = Random.Next(arr.Length - offset);
        CharsRef cRef = new CharsRef(arr, offset, len);
        UnicodeUtil.UTF8toUTF16(@ref, cRef);
        // Expected value goes first so a failure message labels the two strings correctly.
        Assert.AreEqual(unicode, cRef.ToString());
    }
}
/// <summary>
/// Runs <c>UnicodeUtil.UTF8toUTF16</c> on the given bytes. When
/// <paramref name="shouldThrow"/> is true the call must raise a
/// <c>FormatException</c>; otherwise it only has to complete without throwing.
/// </summary>
/// <param name="invalidUtf8">The UTF-8 input for this case.</param>
/// <param name="shouldThrow">Whether decoding is expected to fail.</param>
[Test]
[LuceneNetSpecific]
[TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // ca�, start of 2-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // ca�, start of 3-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // ca�, start of 4-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // cañon
public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)
{
    CharsRef target = new CharsRef();

    if (!shouldThrow)
    {
        // Well-formed input: decoding just has to succeed.
        UnicodeUtil.UTF8toUTF16(invalidUtf8, target);
        return;
    }

    Assert.Throws<FormatException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, target));
}
/// <summary>
/// Encodes a random string with <c>IOUtils.ENCODING_UTF_8_NO_BOM</c> and checks that
/// <c>UnicodeUtil.TryUTF8toUTF16</c> reports success and returns the original text.
/// </summary>
[Test]
[LuceneNetSpecific] // this is a Lucene.NET specific method
[Repeat(100)]
public void TestTryUTF8toUTF16()
{
    string original = TestUtil.RandomRealisticUnicodeString(Random);
    byte[] encodedBytes = IOUtils.ENCODING_UTF_8_NO_BOM.GetBytes(original);
    BytesRef encoded = new BytesRef(encodedBytes);

    bool decodedOk = UnicodeUtil.TryUTF8toUTF16(encoded, out var decoded);

    Assert.IsTrue(decodedOk);
    string roundTripped = decoded?.ToString();
    Assert.AreEqual(original, roundTripped);
}
/// <summary>
/// Decodes the given bytes with <c>UnicodeUtil.UTF8toUTF16WithFallback</c> and compares
/// the resulting text to <paramref name="expected"/>. The truncated-sequence cases
/// expect U+FFFD in place of the incomplete trailing sequence.
/// </summary>
/// <param name="utf8">The UTF-8 input for this case.</param>
/// <param name="expected">The text the decoder should produce.</param>
[Test]
[LuceneNetSpecific] // this is a Lucene.NET specific method
[TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, start of 2-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")]
public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)
{
    CharsRef decoded = new CharsRef();
    UnicodeUtil.UTF8toUTF16WithFallback(utf8, decoded);

    string actual = decoded.ToString();
    Assert.AreEqual(expected, actual);
}
}
}