| using J2N; |
| using J2N.Text; |
| using Lucene.Net.Attributes; |
| using NUnit.Framework; |
| using System; |
| using Assert = Lucene.Net.TestFramework.Assert; |
| |
| namespace Lucene.Net.Util |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* |
| * Some of this code came from the excellent Unicode |
| * conversion examples from: |
| * |
| * http://www.unicode.org/Public/PROGRAMS/CVTUTF |
| * |
| * Full Copyright for that code follows: |
| */ |
| |
| /* |
| * Copyright 2001-2004 Unicode, Inc. |
| * |
| * Disclaimer |
| * |
| * This source code is provided as is by Unicode, Inc. No claims are |
| * made as to fitness for any particular purpose. No warranties of any |
| * kind are expressed or implied. The recipient agrees to determine |
| * applicability of information provided. If this file has been |
| * purchased on magnetic or optical media from Unicode, Inc., the |
| * sole remedy for any claim will be exchange of defective media |
| * within 90 days of receipt. |
| * |
| * Limitations on Rights to Redistribute this Code |
| * |
| * Unicode, Inc. hereby grants the right to freely use the information |
| * supplied in this file in the creation of products supporting the |
| * Unicode Standard, and to make copies of this file in any form |
| * for internal or external distribution as long as this notice |
| * remains attached. |
| */ |
| |
| /* |
| * Additional code came from the IBM ICU library. |
| * |
| * http://www.icu-project.org |
| * |
| * Full Copyright for that code follows. |
| */ |
| |
| /* |
| * Copyright (C) 1999-2010, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, and/or sell copies of the |
| * Software, and to permit persons to whom the Software is furnished to do so, |
| * provided that the above copyright notice(s) and this permission notice appear |
| * in all copies of the Software and that both the above copyright notice(s) and |
| * this permission notice appear in supporting documentation. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. |
| * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE |
| * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR |
| * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER |
| * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
| * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| * |
| * Except as contained in this notice, the name of a copyright holder shall not |
| * be used in advertising or otherwise to promote the sale, use or other |
| * dealings in this Software without prior written authorization of the |
| * copyright holder. |
| */ |
| |
| [TestFixture] |
| public class TestUnicodeUtil : LuceneTestCase |
| { |
|     /// <summary> |
|     /// Checks <c>UnicodeUtil.CodePointCount</c> against malformed UTF-8 (which must be |
|     /// rejected), a handful of hand-encoded sequences, and randomly generated strings. |
|     /// </summary> |
|     [Test] |
|     public virtual void TestCodePointCount() |
|     { |
|         // Continuation bytes (0x80 and 0xBF) appearing where a lead byte is required. |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0x80, 'z', 'z', 'z')); |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xc0 - 1, 'z', 'z', 'z')); |
|         // Lead bytes of 5-byte and longer sequences. |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xf8, 'z', 'z', 'z')); |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xfc, 'z', 'z', 'z')); |
|         // Multi-byte sequences that end before their last byte. |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xc2)); |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xe2)); |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xe2, 0x82)); |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xf0)); |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xf0, 0xa4)); |
|         AssertcodePointCountThrowsAssertionOn(AsByteArray('z', 0xf0, 0xa4, 0xad)); |
| |
|         // Well-formed input: empty, ASCII only, then 'z' followed by one 2-, 3- and 4-byte sequence. |
|         Assert.AreEqual(0, UnicodeUtil.CodePointCount(new BytesRef(AsByteArray()))); |
|         Assert.AreEqual(3, UnicodeUtil.CodePointCount(new BytesRef(AsByteArray('z', 'z', 'z')))); |
|         Assert.AreEqual(2, UnicodeUtil.CodePointCount(new BytesRef(AsByteArray('z', 0xc2, 0xa2)))); |
|         Assert.AreEqual(2, UnicodeUtil.CodePointCount(new BytesRef(AsByteArray('z', 0xe2, 0x82, 0xac)))); |
|         Assert.AreEqual(2, UnicodeUtil.CodePointCount(new BytesRef(AsByteArray('z', 0xf0, 0xa4, 0xad, 0xa2)))); |
| |
|         // Random strings: the count over the UTF-8 bytes must agree with the count over the UTF-16 source. |
|         BytesRef utf8 = new BytesRef(20); |
|         int num = AtLeast(50000); |
|         for (int i = 0; i < num; i++) |
|         { |
|             string s = TestUtil.RandomUnicodeString(Random); |
|             UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8); |
|             Assert.AreEqual(s.CodePointCount(0, s.Length), UnicodeUtil.CodePointCount(utf8)); |
|         } |
|     } |
| |
|     /// <summary> |
|     /// Casts each value to <see cref="byte"/> so that test data can be written with plain |
|     /// <see cref="int"/> and <see cref="char"/> literals. |
|     /// </summary> |
|     /// <param name="ints">Values to convert; every call site in this class passes values in the range 0-255.</param> |
|     /// <returns>A new array of the same length holding the converted values.</returns> |
|     private static byte[] AsByteArray(params int[] ints) |
|     { |
|         return Array.ConvertAll(ints, value => (byte)value); |
|     } |
| |
|     /// <summary> |
|     /// Asserts that <c>UnicodeUtil.CodePointCount</c> rejects <paramref name="bytes"/> with an |
|     /// exception matched by <c>IsIllegalArgumentException()</c>. |
|     /// </summary> |
|     /// <param name="bytes">The byte sequence expected to be rejected.</param> |
|     private static void AssertcodePointCountThrowsAssertionOn(params byte[] bytes) |
|     { |
|         bool threwAssertion = false; |
|         try |
|         { |
|             UnicodeUtil.CodePointCount(new BytesRef(bytes)); |
|         } |
|         catch (Exception e) when (e.IsIllegalArgumentException()) |
|         { |
|             threwAssertion = true; |
|         } |
| |
|         // Name the offending bytes so a failure identifies which sequence was wrongly accepted. |
|         Assert.IsTrue(threwAssertion, "Expected CodePointCount to reject malformed UTF-8: " + BitConverter.ToString(bytes)); |
|     } |
| |
|     /// <summary> |
|     /// Round-trips random strings through UTF-8 and <c>UnicodeUtil.UTF8toUTF32</c>, |
|     /// encoding with the <see cref="string"/> overload of <c>UTF16toUTF8</c>. |
|     /// </summary> |
|     [Test] |
|     public virtual void TestUTF8toUTF32() |
|     { |
|         AssertUTF8toUTF32MatchesSource((s, utf8) => UnicodeUtil.UTF16toUTF8(s, 0, s.Length, utf8)); |
|     } |
| |
|     /// <summary> |
|     /// Same round trip as <see cref="TestUTF8toUTF32"/>, encoding through the value returned by <c>AsCharSequence()</c>. |
|     /// </summary> |
|     [Test, LuceneNetSpecific] |
|     public virtual void TestUTF8toUTF32_ICharSequence() |
|     { |
|         AssertUTF8toUTF32MatchesSource((s, utf8) => UnicodeUtil.UTF16toUTF8(s.AsCharSequence(), 0, s.Length, utf8)); |
|     } |
| |
|     /// <summary> |
|     /// Same round trip as <see cref="TestUTF8toUTF32"/>, encoding from a <c>char[]</c> copy of the string. |
|     /// </summary> |
|     [Test, LuceneNetSpecific] |
|     public virtual void TestUTF8toUTF32_CharArray() |
|     { |
|         AssertUTF8toUTF32MatchesSource((s, utf8) => UnicodeUtil.UTF16toUTF8(s.ToCharArray(), 0, s.Length, utf8)); |
|     } |
| |
|     /// <summary> |
|     /// Shared body of the UTF8toUTF32 tests. For each random string it encodes to UTF-8 with |
|     /// <paramref name="encodeToUtf8"/>, decodes with <c>UnicodeUtil.UTF8toUTF32</c>, and requires |
|     /// the result to match, in both length and content, the code points read directly from the string. |
|     /// </summary> |
|     /// <param name="encodeToUtf8">Encodes the given string into the given <c>BytesRef</c>; selects the overload under test.</param> |
|     private void AssertUTF8toUTF32MatchesSource(Action<string, BytesRef> encodeToUtf8) |
|     { |
|         BytesRef utf8 = new BytesRef(20); |
|         Int32sRef utf32 = new Int32sRef(20); |
|         // NOTE(review): 20 slots presumably covers the longest string RandomUnicodeString(Random) produces - confirm. |
|         int[] codePoints = new int[20]; |
|         int num = AtLeast(50000); |
|         for (int i = 0; i < num; i++) |
|         { |
|             string s = TestUtil.RandomUnicodeString(Random); |
|             encodeToUtf8(s, utf8); |
|             UnicodeUtil.UTF8toUTF32(utf8, utf32); |
| |
|             // Reference decoding taken straight from the UTF-16 source. |
|             int intUpto = 0; |
|             for (int charUpto = 0; charUpto < s.Length;) |
|             { |
|                 int cp = Character.CodePointAt(s, charUpto); |
|                 codePoints[intUpto++] = cp; |
|                 charUpto += Character.CharCount(cp); |
|             } |
| |
|             // The length is part of the pass condition so extra trailing code points cannot slip through. |
|             if (utf32.Length == intUpto && ArrayUtil.Equals(codePoints, 0, utf32.Int32s, utf32.Offset, intUpto)) |
|             { |
|                 continue; |
|             } |
| |
|             // Dump everything before asserting so the diagnostics are never cut short. |
|             Console.WriteLine("FAILED"); |
|             for (int j = 0; j < s.Length; j++) |
|             { |
|                 Console.WriteLine(" char[" + j + "]=" + ((int)s[j]).ToString("x")); |
|             } |
|             Console.WriteLine(); |
|             int comparable = Math.Min(intUpto, utf32.Length); |
|             for (int j = 0; j < comparable; j++) |
|             { |
|                 // Read through utf32.Offset, exactly as the comparison above does. |
|                 Console.WriteLine(" " + utf32.Int32s[utf32.Offset + j].ToString("x") + " vs " + codePoints[j].ToString("x")); |
|             } |
|             Assert.AreEqual(intUpto, utf32.Length); |
|             Assert.Fail("mismatch"); |
|         } |
|     } |
| |
|     /// <summary> |
|     /// Exercises <c>UnicodeUtil.NewString</c> over several (start, count) windows of a code point |
|     /// array, including windows that are expected to be rejected. |
|     /// </summary> |
|     [Test] |
|     public virtual void TestNewString() |
|     { |
|         // Two supplementary code points, an unpaired high surrogate, 'A', and an invalid value (-1). |
|         int[] codePoints = new int[] |
|         { |
|             Character.ToCodePoint(Character.MinHighSurrogate, Character.MaxLowSurrogate), |
|             Character.ToCodePoint(Character.MaxHighSurrogate, Character.MinLowSurrogate), |
|             Character.MaxHighSurrogate, |
|             'A', |
|             -1 |
|         }; |
| |
|         // The UTF-16 form of the first four entries above. |
|         string cpString = "" + Character.MinHighSurrogate + Character.MaxLowSurrogate |
|             + Character.MaxHighSurrogate + Character.MinLowSurrogate |
|             + Character.MaxHighSurrogate + 'A'; |
| |
|         // Each row: { start, count, expected start in cpString, expected char count }. |
|         // An expected char count of -1 means the call must throw. |
|         int[][] tests = new int[][] |
|         { |
|             new int[] { 0, 1, 0, 2 }, |
|             new int[] { 0, 2, 0, 4 }, |
|             new int[] { 1, 1, 2, 2 }, |
|             new int[] { 1, 2, 2, 3 }, |
|             new int[] { 1, 3, 2, 4 }, |
|             new int[] { 2, 2, 4, 2 }, |
|             new int[] { 2, 3, 0, -1 }, |
|             new int[] { 4, 5, 0, -1 }, |
|             new int[] { 3, -1, 0, -1 } |
|         }; |
| |
|         foreach (int[] t in tests) |
|         { |
|             int start = t[0]; |
|             int count = t[1]; |
|             int expectedStart = t[2]; |
|             int expectedCount = t[3]; |
|             bool expectFailure = expectedCount == -1; |
| |
|             try |
|             { |
|                 string str = UnicodeUtil.NewString(codePoints, start, count); |
|                 Assert.IsFalse(expectFailure); |
|                 Assert.AreEqual(cpString.Substring(expectedStart, expectedCount), str); |
|                 continue; |
|             } |
|             catch (Exception e1) when (e1.IsIndexOutOfBoundsException()) |
|             { |
|                 // Acceptable only for rows that expect a failure; checked below. |
|             } |
|             catch (Exception e2) when (e2.IsIllegalArgumentException()) |
|             { |
|                 // Acceptable only for rows that expect a failure; checked below. |
|             } |
|             Assert.IsTrue(expectFailure); |
|         } |
|     } |
| |
|     /// <summary> |
|     /// Decodes random strings with <c>UnicodeUtil.UTF8toUTF16</c> into a <c>CharsRef</c> whose |
|     /// backing array size, offset and length are random, and requires the decoded text to equal the source. |
|     /// </summary> |
|     [Test] |
|     public virtual void TestUTF8UTF16CharsRef() |
|     { |
|         int num = AtLeast(3989); |
|         for (int i = 0; i < num; i++) |
|         { |
|             string unicode = TestUtil.RandomRealisticUnicodeString(Random); |
|             BytesRef @ref = new BytesRef(unicode); |
|             char[] arr = new char[1 + Random.Next(100)]; |
|             int offset = Random.Next(arr.Length); |
|             int len = Random.Next(arr.Length - offset); |
|             CharsRef cRef = new CharsRef(arr, offset, len); |
|             UnicodeUtil.UTF8toUTF16(@ref, cRef); |
|             // Expected value first, so a failure message labels the two sides correctly. |
|             Assert.AreEqual(unicode, cRef.ToString()); |
|         } |
|     } |
| |
|     /// <summary> |
|     /// Checks that <c>UnicodeUtil.UTF8toUTF16</c> throws <see cref="FormatException"/> for the |
|     /// truncated inputs and completes without throwing for the well-formed one. |
|     /// </summary> |
|     /// <param name="invalidUtf8">The UTF-8 bytes to decode (well-formed when <paramref name="shouldThrow"/> is false).</param> |
|     /// <param name="shouldThrow">Whether decoding is expected to throw.</param> |
|     [Test] |
|     [LuceneNetSpecific] |
|     [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // "ca" + lead byte of a 2-byte sequence, cut off |
|     [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // "ca" + lead byte of a 3-byte sequence, cut off |
|     [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // "ca" + lead byte of a 4-byte sequence, cut off |
|     [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // "cañon", well-formed |
|     public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow) |
|     { |
|         var scratch = new CharsRef(); |
| |
|         if (shouldThrow) |
|         { |
|             Assert.Throws<FormatException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch)); |
|         } |
|         else |
|         { |
|             // No assertion needed: any exception thrown here fails the test. |
|             UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch); |
|         } |
|     } |
| |
|     /// <summary> |
|     /// Checks that <c>UnicodeUtil.TryUTF8toUTF16</c> reports success on the UTF-8 encoding of a |
|     /// random string and returns text equal to that string. |
|     /// </summary> |
|     [Test] |
|     [LuceneNetSpecific] // this is a Lucene.NET specific method |
|     [Repeat(100)] |
|     public void TestTryUTF8toUTF16() |
|     { |
|         string unicode = TestUtil.RandomRealisticUnicodeString(Random); |
|         var utf8 = new BytesRef(IOUtils.ENCODING_UTF_8_NO_BOM.GetBytes(unicode)); |
| |
|         bool success = UnicodeUtil.TryUTF8toUTF16(utf8, out var chars); |
| |
|         Assert.IsTrue(success); |
|         Assert.AreEqual(unicode, chars?.ToString()); |
|     } |
| |
|     /// <summary> |
|     /// Checks that <c>UnicodeUtil.UTF8toUTF16WithFallback</c> yields U+FFFD in place of a |
|     /// truncated trailing sequence and decodes well-formed input unchanged. |
|     /// </summary> |
|     /// <param name="utf8">The UTF-8 bytes to decode.</param> |
|     /// <param name="expected">The text the decoder is expected to produce.</param> |
|     [Test] |
|     [LuceneNetSpecific] // this is a Lucene.NET specific method |
|     [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // "ca" + lead byte of a 2-byte sequence, cut off |
|     [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // "ca" + lead byte of a 3-byte sequence, cut off |
|     [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // "ca" + lead byte of a 4-byte sequence, cut off |
|     [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")] |
|     public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected) |
|     { |
|         var scratch = new CharsRef(); |
| |
|         UnicodeUtil.UTF8toUTF16WithFallback(utf8, scratch); |
| |
|         Assert.AreEqual(expected, scratch.ToString()); |
|     } |
| } |
| } |