| using J2N.Text; |
| using Lucene.Net.Attributes; |
| using Lucene.Net.Documents; |
| using Lucene.Net.Util; |
| using NUnit.Framework; |
| using System; |
| using System.Collections.Generic; |
| using System.Text; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Index |
| { |
| using BytesRef = Lucene.Net.Util.BytesRef; |
| using CharsRef = Lucene.Net.Util.CharsRef; |
| using Directory = Lucene.Net.Store.Directory; |
| using Document = Documents.Document; |
| using Field = Field; |
| using LuceneTestCase = Lucene.Net.Util.LuceneTestCase; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer; |
| using UnicodeUtil = Lucene.Net.Util.UnicodeUtil; |
| |
| [TestFixture] |
| public class TestIndexWriterUnicode : LuceneTestCase |
| { |
| /// <summary> |
| /// Test data for <see cref="TestInvalidUTF16"/>, laid out as consecutive pairs: the string at each |
| /// even index is the value that gets indexed, and the string at the following odd index is the |
| /// value expected back, with each unpaired surrogate replaced by U+FFFD. |
| /// </summary> |
| internal readonly string[] Utf8Data = |
| { |
|     // one unpaired low surrogate (U+DC17) |
|     "ab\udc17cd", "ab\ufffdcd", |
|     "\udc17abcd", "\ufffdabcd", |
|     "\udc17", "\ufffd", |
|     // two consecutive low surrogates |
|     "ab\udc17\udc17cd", "ab\ufffd\ufffdcd", |
|     "\udc17\udc17abcd", "\ufffd\ufffdabcd", |
|     "\udc17\udc17", "\ufffd\ufffd", |
|     // one unpaired high surrogate (U+D917) |
|     "ab\ud917cd", "ab\ufffdcd", |
|     "\ud917abcd", "\ufffdabcd", |
|     "\ud917", "\ufffd", |
|     // two consecutive high surrogates |
|     "ab\ud917\ud917cd", "ab\ufffd\ufffdcd", |
|     "\ud917\ud917abcd", "\ufffd\ufffdabcd", |
|     "\ud917\ud917", "\ufffd\ufffd", |
|     // low surrogate followed by high surrogate (wrong order, so both are unpaired) |
|     "ab\udc17\ud917cd", "ab\ufffd\ufffdcd", |
|     "\udc17\ud917abcd", "\ufffd\ufffdabcd", |
|     "\udc17\ud917", "\ufffd\ufffd", |
|     // low, high, low, high: only the middle high+low forms a valid pair and survives |
|     "ab\udc17\ud917\udc17\ud917cd", "ab\ufffd\ud917\udc17\ufffdcd", |
|     "\udc17\ud917\udc17\ud917abcd", "\ufffd\ud917\udc17\ufffdabcd", |
|     "\udc17\ud917\udc17\ud917", "\ufffd\ud917\udc17\ufffd" |
| }; |
| |
| /// <summary> |
| /// Draws the next integer from the test's <c>Random</c> source by forwarding |
| /// <paramref name="lim"/> to <c>Random.Next</c>. |
| /// </summary> |
| private int NextInt(int lim) => Random.Next(lim); |
| |
| /// <summary> |
| /// Returns <paramref name="start"/> plus a random offset drawn via <c>NextInt(end - start)</c>. |
| /// </summary> |
| private int NextInt(int start, int end) |
| { |
|     int span = end - start; |
|     return start + NextInt(span); |
| } |
| |
| /// <summary> |
| /// Fills <paramref name="buffer"/> with random UTF-16 code units and writes into |
| /// <paramref name="expected"/> the same content with every deliberately unpaired |
| /// surrogate replaced by U+FFFD. |
| /// </summary> |
| /// <param name="buffer">Receives the generated code units (may contain unpaired surrogates).</param> |
| /// <param name="expected">Receives the sanitized counterpart of <paramref name="buffer"/>.</param> |
| /// <param name="offset">First index to fill; moved back by one when it points at a low surrogate.</param> |
| /// <param name="count">Number of code units to fill, counted from the original <paramref name="offset"/>.</param> |
| /// <returns><c>true</c> if at least one unpaired surrogate was written to <paramref name="buffer"/>.</returns> |
| private bool FillUnicode(char[] buffer, char[] expected, int offset, int count) |
| { |
|     int end = offset + count; |
|     bool sawIllegal = false; |
| |
|     // Don't start in the middle of a valid surrogate pair |
|     bool startsOnLowSurrogate = offset > 0 && buffer[offset] >= 0xdc00 && buffer[offset] < 0xe000; |
|     if (startsOnLowSurrogate) |
|     { |
|         offset--; |
|     } |
| |
|     int pos = offset; |
|     while (pos < end) |
|     { |
|         int kind = NextInt(6); |
|         // Two-unit sequences are only generated when a second slot is available. |
|         bool hasNext = pos < end - 1; |
| |
|         if (kind == 0 && hasNext) |
|         { |
|             // Valid surrogate pair: high surrogate, then low surrogate |
|             expected[pos] = buffer[pos] = (char)NextInt(0xd800, 0xdc00); |
|             pos++; |
|             expected[pos] = buffer[pos] = (char)NextInt(0xdc00, 0xe000); |
|         } |
|         else if (kind <= 1) |
|         { |
|             // Below U+0080 |
|             expected[pos] = buffer[pos] = (char)NextInt(0x80); |
|         } |
|         else if (kind == 2) |
|         { |
|             // U+0080 up to (not including) U+0800 |
|             expected[pos] = buffer[pos] = (char)NextInt(0x80, 0x800); |
|         } |
|         else if (kind == 3) |
|         { |
|             // U+0800 up to the start of the surrogate range |
|             expected[pos] = buffer[pos] = (char)NextInt(0x800, 0xd800); |
|         } |
|         else if (kind == 4) |
|         { |
|             // Above the surrogate range |
|             expected[pos] = buffer[pos] = (char)NextInt(0xe000, 0xffff); |
|         } |
|         else if (kind == 5 && hasNext) |
|         { |
|             if (NextInt(10) == 7) |
|             { |
|                 // Illegal unpaired surrogate: either a lone high or a lone low one |
|                 buffer[pos] = Random.NextBoolean() |
|                     ? (char)NextInt(0xd800, 0xdc00) |
|                     : (char)NextInt(0xdc00, 0xe000); |
|                 expected[pos] = (char)0xfffd; |
|                 pos++; |
|                 // The following unit is drawn from a non-surrogate range. |
|                 expected[pos] = buffer[pos] = (char)NextInt(0x800, 0xd800); |
|                 sawIllegal = true; |
|             } |
|             else |
|             { |
|                 expected[pos] = buffer[pos] = (char)NextInt(0x800, 0xd800); |
|             } |
|         } |
|         else |
|         { |
|             expected[pos] = buffer[pos] = ' '; |
|         } |
| |
|         pos++; |
|     } |
| |
|     return sawIllegal; |
| } |
| |
| /// <summary> |
| /// Returns a random integer drawn from <paramref name="r"/>; |
| /// both <paramref name="start"/> and <paramref name="end"/> are inclusive. |
| /// </summary> |
| private int GetInt(Random r, int start, int end) |
| { |
|     int width = 1 + end - start; |
|     return start + r.Next(width); |
| } |
| |
| /// <summary> |
| /// Formats a UTF-16 code unit as <c>U+</c> followed by its lowercase hexadecimal value (no padding). |
| /// </summary> |
| private string AsUnicodeChar(char c) |
| { |
|     int codeUnit = c; |
|     string hex = codeUnit.ToString("x"); |
|     return "U+" + hex; |
| } |
| |
| /// <summary> |
| /// Describes a term of at most two UTF-16 code units as a comma-separated list of |
| /// <c>U+xxxx</c> values, for use in assertion messages. |
| /// </summary> |
| /// <param name="s">The term text; asserted to be no longer than two code units.</param> |
| /// <returns>One <c>U+xxxx</c> entry per code unit, or an empty string for an empty term.</returns> |
| private string TermDesc(string s) |
| { |
|     Assert.IsTrue(s.Length <= 2); |
| |
|     // Loop instead of indexing s[0]/s[1] directly: callers build this description eagerly as an |
|     // assertion message, so an empty term must not blow up with an IndexOutOfRangeException. |
|     StringBuilder desc = new StringBuilder(); |
|     for (int i = 0; i < s.Length; i++) |
|     { |
|         if (i > 0) |
|         { |
|             desc.Append(','); |
|         } |
|         desc.Append(AsUnicodeChar(s[i])); |
|     } |
|     return desc.ToString(); |
| } |
| |
| /// <summary> |
| /// Walks every term of field "f" in <paramref name="r"/> and asserts that the terms come back in |
| /// strictly increasing <see cref="BytesRef"/> order and that each one is in <paramref name="allTerms"/>; |
| /// then re-seeks to every term that was seen. |
| /// </summary> |
| /// <param name="r">Reader whose "f" terms are checked.</param> |
| /// <param name="allTerms">Every term that was added to the index.</param> |
| /// <param name="isTop">When <c>true</c>, the seen terms must equal <paramref name="allTerms"/> exactly.</param> |
| private void CheckTermsOrder(IndexReader r, ISet<string> allTerms, bool isTop) |
| { |
|     TermsEnum terms = MultiFields.GetFields(r).GetTerms("f").GetIterator(null); |
| |
|     BytesRef last = new BytesRef(); |
|     ISet<string> seenTerms = new JCG.HashSet<string>(); |
| |
|     BytesRef term; |
|     while ((term = terms.Next()) != null) |
|     { |
|         // Each term must sort strictly after the previous one. |
|         Assert.IsTrue(last.CompareTo(term) < 0); |
|         last.CopyBytes(term); |
| |
|         string s = term.Utf8ToString(); |
|         Assert.IsTrue(allTerms.Contains(s), "term " + TermDesc(s) + " was not added to index (count=" + allTerms.Count + ")"); |
|         seenTerms.Add(s); |
|     } |
| |
|     if (isTop) |
|     { |
|         Assert.IsTrue(allTerms.SetEquals(seenTerms)); |
|     } |
| |
|     // Test seeking (foreach also disposes the set's enumerator): |
|     foreach (string seen in seenTerms) |
|     { |
|         BytesRef tr = new BytesRef(seen); |
|         Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(tr), "seek failed for term=" + TermDesc(tr.Utf8ToString())); |
|     } |
| } |
| |
| // LUCENE-510 |
| // Converts randomly generated UTF-16 buffers to UTF-8 and back with UnicodeUtil. Buffers without |
| // unpaired surrogates are also compared byte-for-byte against the platform UTF-8 encoding. |
| [Test, LongRunningTest] |
| public virtual void TestRandomUnicodeStrings() |
| { |
|     const int bufferLength = 20; |
| |
|     char[] buffer = new char[bufferLength]; |
|     char[] expected = new char[bufferLength]; |
| |
|     BytesRef utf8 = new BytesRef(bufferLength); |
|     CharsRef utf16 = new CharsRef(bufferLength); |
| |
|     int num = AtLeast(100000); |
|     for (int iter = 0; iter < num; iter++) |
|     { |
|         bool hasIllegal = FillUnicode(buffer, expected, 0, bufferLength); |
| |
|         UnicodeUtil.UTF16toUTF8(buffer, 0, bufferLength, utf8); |
|         if (!hasIllegal) |
|         { |
| #pragma warning disable 612, 618 |
|             var b = (new string(buffer, 0, bufferLength)).GetBytes(IOUtils.CHARSET_UTF_8); |
| #pragma warning restore 612, 618 |
|             Assert.AreEqual(b.Length, utf8.Length); |
|             for (int i = 0; i < b.Length; i++) |
|             { |
|                 Assert.AreEqual(b[i], utf8.Bytes[i]); |
|             } |
|         } |
| |
|         UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16); |
|         // Expected value first, so that a failure reports "expected 20 but was N". |
|         Assert.AreEqual(bufferLength, utf16.Length); |
|         for (int i = 0; i < bufferLength; i++) |
|         { |
|             Assert.AreEqual(expected[i], utf16.Chars[i]); |
|         } |
|     } |
| } |
| |
| // LUCENE-510 |
| // Encodes every Unicode code point except the surrogate range with UnicodeUtil and checks the |
| // result against the platform UTF-8 encoding in both directions. |
| [Test, LongRunningTest] |
| public virtual void TestAllUnicodeChars() |
| { |
|     BytesRef utf8 = new BytesRef(10); |
|     CharsRef utf16 = new CharsRef(10); |
|     char[] chars = new char[2]; |
|     // Inclusive upper bound: U+10FFFF is the last valid code point and must be covered too. |
|     for (int ch = 0; ch <= 0x0010FFFF; ch++) |
|     { |
|         if (ch == 0xd800) |
|         // Skip invalid code points |
|         { |
|             ch = 0xe000; |
|         } |
| |
|         int len = 0; |
|         if (ch <= 0xffff) |
|         { |
|             chars[len++] = (char)ch; |
|         } |
|         else |
|         { |
|             // Supplementary code point: split into a high/low surrogate pair. |
|             chars[len++] = (char)(((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START); |
|             chars[len++] = (char)(((ch - 0x0010000) & 0x3FF) + UnicodeUtil.UNI_SUR_LOW_START); |
|         } |
| |
|         UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8); |
| |
|         string s1 = new string(chars, 0, len); |
|         string s2 = Encoding.UTF8.GetString(utf8.Bytes, utf8.Offset, utf8.Length); |
|         Assert.AreEqual(s1, s2, "codepoint " + ch); |
| |
|         // Honour utf8.Offset everywhere the bytes are read, as the GetString call above does. |
|         UnicodeUtil.UTF8toUTF16(utf8.Bytes, utf8.Offset, utf8.Length, utf16); |
|         Assert.AreEqual(s1, new string(utf16.Chars, 0, utf16.Length), "codepoint " + ch); |
| |
|         // The platform encoder is the reference, so its output goes in the "expected" slot. |
|         var b = s1.GetBytes(Encoding.UTF8); |
|         Assert.AreEqual(b.Length, utf8.Length); |
|         for (int j = 0; j < utf8.Length; j++) |
|         { |
|             Assert.AreEqual(b[j], utf8.Bytes[utf8.Offset + j]); |
|         } |
|     } |
| } |
| |
| // Indexes a token containing an embedded U+FFFF and checks that exactly one document |
| // is reported for the term "a\uffffb". |
| [Test] |
| public virtual void TestEmbeddedFFFF() |
| { |
|     // using blocks release reader, writer and directory (in that order) even if the assertion throws. |
|     using (Directory d = NewDirectory()) |
|     using (IndexWriter w = new IndexWriter(d, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)))) |
|     { |
|         Document doc = new Document(); |
|         doc.Add(NewTextField("field", "a a\uffffb", Field.Store.NO)); |
|         w.AddDocument(doc); |
| |
|         doc = new Document(); |
|         doc.Add(NewTextField("field", "a", Field.Store.NO)); |
|         w.AddDocument(doc); |
| |
|         using (IndexReader r = w.GetReader()) |
|         { |
|             Assert.AreEqual(1, r.DocFreq(new Term("field", "a\uffffb"))); |
|         } |
|     } |
| } |
| |
| // LUCENE-510 |
| // Indexes each even-indexed Utf8Data string in its own stored field and checks that both the |
| // indexed term and the stored value equal the odd-indexed string that follows it. |
| [Test] |
| public virtual void TestInvalidUTF16() |
| { |
|     int count = Utf8Data.Length / 2; |
| |
|     // using blocks release writer, reader and directory even if an assertion throws. |
|     using (Directory dir = NewDirectory()) |
|     { |
|         using (IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new TestIndexWriter.StringSplitAnalyzer()))) |
|         { |
|             Document doc = new Document(); |
|             for (int i = 0; i < count; i++) |
|             { |
|                 doc.Add(NewTextField("f" + i, Utf8Data[2 * i], Field.Store.YES)); |
|             } |
|             w.AddDocument(doc); |
|         } |
| |
|         // The writer is disposed above, before the reader is opened on the directory. |
|         using (IndexReader ir = DirectoryReader.Open(dir)) |
|         { |
|             Document doc2 = ir.Document(0); |
|             for (int i = 0; i < count; i++) |
|             { |
|                 Assert.AreEqual(1, ir.DocFreq(new Term("f" + i, Utf8Data[2 * i + 1])), "field " + i + " was not indexed correctly"); |
|                 Assert.AreEqual(Utf8Data[2 * i + 1], doc2.GetField("f" + i).GetStringValue(), "field " + i + " is incorrect"); |
|             } |
|         } |
|     } |
| } |
| |
| // Make sure terms, including ones with surrogate pairs, |
| // sort in codepoint sort order by default |
| [Test] |
| public virtual void TestTermUTF16SortOrder() |
| { |
|     Random rnd = Random; |
| |
|     // using blocks release readers, writer and directory even if an assertion throws. |
|     using (Directory dir = NewDirectory()) |
|     using (RandomIndexWriter writer = new RandomIndexWriter( |
| #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION |
|         this, |
| #endif |
|         rnd, dir)) |
|     { |
|         Document d = new Document(); |
|         // One field instance, re-used for every document via SetStringValue |
|         Field f = NewStringField("f", "", Field.Store.NO); |
|         d.Add(f); |
|         char[] chars = new char[2]; |
|         ISet<string> allTerms = new JCG.HashSet<string>(); |
| |
|         int num = AtLeast(200); |
|         for (int i = 0; i < num; i++) |
|         { |
|             string s; |
|             if (rnd.NextBoolean()) |
|             { |
|                 // Single char |
|                 if (rnd.NextBoolean()) |
|                 { |
|                     // Above surrogates |
|                     chars[0] = (char)GetInt(rnd, 1 + UnicodeUtil.UNI_SUR_LOW_END, 0xffff); |
|                 } |
|                 else |
|                 { |
|                     // Below surrogates |
|                     chars[0] = (char)GetInt(rnd, 0, UnicodeUtil.UNI_SUR_HIGH_START - 1); |
|                 } |
|                 s = new string(chars, 0, 1); |
|             } |
|             else |
|             { |
|                 // Surrogate pair |
|                 chars[0] = (char)GetInt(rnd, UnicodeUtil.UNI_SUR_HIGH_START, UnicodeUtil.UNI_SUR_HIGH_END); |
|                 Assert.IsTrue(((int)chars[0]) >= UnicodeUtil.UNI_SUR_HIGH_START && ((int)chars[0]) <= UnicodeUtil.UNI_SUR_HIGH_END); |
|                 chars[1] = (char)GetInt(rnd, UnicodeUtil.UNI_SUR_LOW_START, UnicodeUtil.UNI_SUR_LOW_END); |
|                 s = new string(chars, 0, 2); |
|             } |
|             allTerms.Add(s); |
|             f.SetStringValue(s); |
| |
|             writer.AddDocument(d); |
| |
|             // Commit every 42 documents (presumably so the index can end up with several segments). |
|             if ((1 + i) % 42 == 0) |
|             { |
|                 writer.Commit(); |
|             } |
|         } |
| |
|         // Test multi segment |
|         using (IndexReader r = writer.GetReader()) |
|         { |
|             // Test each sub-segment |
|             foreach (AtomicReaderContext ctx in r.Leaves) |
|             { |
|                 CheckTermsOrder(ctx.Reader, allTerms, false); |
|             } |
|             CheckTermsOrder(r, allTerms, true); |
|         } |
| |
|         writer.ForceMerge(1); |
| |
|         // Test single segment |
|         using (IndexReader r = writer.GetReader()) |
|         { |
|             CheckTermsOrder(r, allTerms, true); |
|         } |
|     } |
| } |
| } |
| } |