| using J2N; |
| using J2N.Text; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using System; |
| using System.Diagnostics; |
| using System.Diagnostics.CodeAnalysis; |
| using System.Globalization; |
| using System.IO; |
| using System.Text; |
| |
| namespace Lucene.Net.Analysis.Util |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// <see cref="CharacterUtils"/> provides a unified interface to Character-related |
| /// operations to implement backwards compatible character operations based on a |
| /// <see cref="LuceneVersion"/> instance. |
| /// |
| /// @lucene.internal |
| /// </summary> |
| public abstract class CharacterUtils |
| { |
| // LUCENENET specific instance that reproduces the broken Unicode support of Lucene 3.0. |
| // It is handed out by GetInstance and GetJava4Instance for versions before LUCENE_31. |
| // See the TestCharArraySet.TestSupplementaryCharsBWCompat() |
| // and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests. |
| private static readonly CharacterUtils JAVA_4_BW_COMPAT = new Java4CharacterUtilsBWCompatibility(); |
| |
| // Shared instances returned for LUCENE_31 and later: JAVA_4 by GetJava4Instance, JAVA_5 by GetInstance. |
| private static readonly CharacterUtils JAVA_4 = new Java4CharacterUtils(); |
| private static readonly CharacterUtils JAVA_5 = new Java5CharacterUtils(); |
| |
/// <summary>
/// Selects the shared <see cref="CharacterUtils"/> implementation that matches
/// the supplied <see cref="LuceneVersion"/>.
/// </summary>
/// <param name="matchVersion"> the version whose character handling should be reproduced </param>
/// <returns> the Java 5 style implementation when <paramref name="matchVersion"/> is
/// <c>LUCENE_31</c> or later; otherwise the Lucene 3.0 backward compatible implementation. </returns>
public static CharacterUtils GetInstance(LuceneVersion matchVersion)
{
#pragma warning disable 612, 618
    bool isLucene31OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618

    if (isLucene31OrLater)
    {
        return JAVA_5;
    }

    return JAVA_4_BW_COMPAT;
}
| |
/// <summary>
/// Returns a <see cref="CharacterUtils"/> instance compatible with Java 1.4.
/// </summary>
/// <param name="matchVersion"> LUCENENET specific - selects between the plain Java 4 instance
/// (<c>LUCENE_31</c> or later) and the instance that also reproduces the Lucene 3.0 case conversion </param>
/// <returns> the shared Java 4 style instance for <paramref name="matchVersion"/> </returns>
public static CharacterUtils GetJava4Instance(LuceneVersion matchVersion) // LUCENENET specific - added matchVersion parameter so we can support backward compatible Unicode support
{
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
    {
        return JAVA_4;
    }

    return JAVA_4_BW_COMPAT;
#pragma warning restore 612, 618
}
| |
| /// <summary> |
| /// Returns the code point at the given index of the <see cref="string"/>. |
| /// Depending on the <see cref="LuceneVersion"/> passed to |
| /// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method mimics the behavior |
| /// of <c>Character.CodePointAt(char[], int)</c> as it would have been |
| /// available on a Java 1.4 JVM or on a later virtual machine version. |
| /// </summary> |
| /// <param name="seq"> |
| /// a character sequence </param> |
| /// <param name="offset"> |
| /// the index into <paramref name="seq"/> of the char at which the code point starts |
| /// </param> |
| /// <returns> the Unicode code point at the given index </returns> |
| /// <exception cref="ArgumentNullException"> |
| /// - if the sequence is <c>null</c> (checked explicitly by the Java 4 implementations in this file). </exception> |
| /// <exception cref="ArgumentOutOfRangeException"> |
| /// - if the value offset is negative or not less than the length of |
| /// the character sequence. </exception> |
| public abstract int CodePointAt(string seq, int offset); |
| |
| /// <summary> |
| /// Returns the code point at the given index of the <see cref="ICharSequence"/>. |
| /// Depending on the <see cref="LuceneVersion"/> passed to |
| /// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method mimics the behavior |
| /// of <c>Character.CodePointAt(char[], int)</c> as it would have been |
| /// available on a Java 1.4 JVM or on a later virtual machine version. |
| /// </summary> |
| /// <param name="seq"> |
| /// a character sequence </param> |
| /// <param name="offset"> |
| /// the index into <paramref name="seq"/> of the char at which the code point starts |
| /// </param> |
| /// <returns> the Unicode code point at the given index </returns> |
| /// <exception cref="ArgumentNullException"> |
| /// - if the sequence is <c>null</c> (checked explicitly by the Java 4 implementations in this file). </exception> |
| /// <exception cref="ArgumentOutOfRangeException"> |
| /// - if the value offset is negative or not less than the length of |
| /// the character sequence. </exception> |
| public abstract int CodePointAt(ICharSequence seq, int offset); |
| |
| /// <summary> |
| /// Returns the code point at the given index of the char array where only elements |
| /// with index less than the limit are used. |
| /// Depending on the <see cref="LuceneVersion"/> passed to |
| /// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method mimics the behavior |
| /// of <c>Character.CodePointAt(char[], int)</c> as it would have been |
| /// available on a Java 1.4 JVM or on a later virtual machine version. |
| /// </summary> |
| /// <param name="chars"> |
| /// a character array </param> |
| /// <param name="offset"> |
| /// the index into <paramref name="chars"/> of the char at which the code point starts </param> |
| /// <param name="limit"> the index after the last element that should be used to calculate |
| /// the code point. |
| /// </param> |
| /// <returns> the Unicode code point at the given index </returns> |
| /// <exception cref="ArgumentNullException"> |
| /// - if the array is <c>null</c> (checked explicitly by the Java 4 implementations in this file). </exception> |
| /// <exception cref="ArgumentOutOfRangeException"> |
| /// - if the value offset is negative or not less than the length of |
| /// the char array; the Java 4 implementations also throw it when offset is not less than limit. </exception> |
| public abstract int CodePointAt(char[] chars, int offset, int limit); |
| |
| /// <summary> |
| /// Returns the number of code points in <paramref name="seq"/> as counted by this implementation. |
| /// The Java 4 implementations in this file return the number of chars instead. </summary> |
| public abstract int CodePointCount(string seq); |
| |
| /// <summary> |
| /// Returns the number of code points in <paramref name="seq"/> as counted by this implementation. |
| /// The Java 4 implementations in this file return the number of chars instead. </summary> |
| public abstract int CodePointCount(ICharSequence seq); |
| |
| /// <summary> |
| /// Returns the number of code points in <paramref name="seq"/> as counted by this implementation. |
| /// The Java 4 implementations in this file return the number of chars instead. </summary> |
| public abstract int CodePointCount(char[] seq); |
| |
| /// <summary> |
| /// Returns the number of code points in <paramref name="seq"/> as counted by this implementation. |
| /// The Java 4 implementations in this file return the number of chars instead. </summary> |
| public abstract int CodePointCount(StringBuilder seq); |
| |
/// <summary>
/// Allocates a <see cref="T:char[]"/> of <paramref name="bufferSize"/> elements and wraps it in a
/// new, empty <see cref="CharacterBuffer"/> (offset and length both <c>0</c>).
/// </summary>
/// <param name="bufferSize">
/// the size of the internal char array, must be <c>&gt;= 2</c> </param>
/// <returns> a new <see cref="CharacterBuffer"/> instance. </returns>
/// <exception cref="ArgumentException"> if <paramref name="bufferSize"/> is less than 2. </exception>
public static CharacterBuffer NewCharacterBuffer(int bufferSize)
{
    if (bufferSize >= 2)
    {
        char[] storage = new char[bufferSize];
        return new CharacterBuffer(storage, 0, 0);
    }

    throw new ArgumentException("buffersize must be >= 2");
}
| |
| |
/// <summary>
/// Lowercases <paramref name="length"/> chars of <paramref name="buffer"/> in place, starting at
/// <paramref name="offset"/>, via <see cref="TextInfo.ToLower(string)"/> in the invariant culture.
/// </summary>
/// <param name="buffer"> the char buffer to lowercase </param>
/// <param name="offset"> the index of the first char to convert </param>
/// <param name="length"> the number of characters in the buffer to lower case </param>
public virtual void ToLower(char[] buffer, int offset, int length) // LUCENENET specific - marked virtual so we can override the default
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(buffer.Length >= length);
        // The offset has to lie inside the buffer. The former check (offset <= 0) was inverted
        // and rejected every positive offset whenever asserts were enabled.
        Debugging.Assert(offset >= 0 && offset <= buffer.Length);
    }

    // LUCENENET: the whole range is converted with one TextInfo call and copied back over the
    // same range. This replaces the per-code-point loop of the original Lucene implementation
    // (still used by Java4CharacterUtilsBWCompatibility); calling TextInfo directly eliminates
    // a few method calls compared to string.ToLowerInvariant().
    // Optimization provided by Vincent Van Den Berghe:
    // http://search-lucene.com/m/Lucene.Net/j1zMf1uckOzOYqsi?subj=Proposal+to+speed+up+implementation+of+LowercaseFilter+charUtils+ToLower
    CultureInfo.InvariantCulture.TextInfo
        .ToLower(new string(buffer, offset, length))
        .CopyTo(0, buffer, offset, length);
}
| |
/// <summary>
/// Uppercases <paramref name="length"/> chars of <paramref name="buffer"/> in place, starting at
/// <paramref name="offset"/>, via <see cref="TextInfo.ToUpper(string)"/> in the invariant culture.
/// </summary>
/// <param name="buffer"> the char buffer to UPPERCASE </param>
/// <param name="offset"> the index of the first char to convert </param>
/// <param name="length"> the number of characters in the buffer to upper case </param>
public virtual void ToUpper(char[] buffer, int offset, int length) // LUCENENET specific - marked virtual so we can override the default
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(buffer.Length >= length);
        // The offset has to lie inside the buffer. The former check (offset <= 0) was inverted
        // and rejected every positive offset whenever asserts were enabled.
        Debugging.Assert(offset >= 0 && offset <= buffer.Length);
    }

    // LUCENENET: the whole range is converted with one TextInfo call and copied back over the
    // same range. This replaces the per-code-point loop of the original Lucene implementation
    // (still used by Java4CharacterUtilsBWCompatibility); calling TextInfo directly eliminates
    // a few method calls compared to string.ToUpperInvariant().
    // Optimization provided by Vincent Van Den Berghe:
    // http://search-lucene.com/m/Lucene.Net/j1zMf1uckOzOYqsi?subj=Proposal+to+speed+up+implementation+of+LowercaseFilter+charUtils+ToLower
    CultureInfo.InvariantCulture.TextInfo
        .ToUpper(new string(buffer, offset, length))
        .CopyTo(0, buffer, offset, length);
}
| |
/// <summary>
/// Decodes <paramref name="srcLen"/> .NET chars of <paramref name="src"/>, starting at
/// <paramref name="srcOff"/>, into Unicode code points stored in <paramref name="dest"/>
/// beginning at <paramref name="destOff"/>. Code points are obtained through this instance's
/// <see cref="CodePointAt(char[], int, int)"/>. </summary>
/// <returns> The number of code points written to the destination buffer. </returns>
/// <exception cref="ArgumentException"> if <paramref name="srcLen"/> is negative. </exception>
public int ToCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff)
{
    if (srcLen < 0)
    {
        throw new ArgumentException("srcLen must be >= 0");
    }

    int consumed = 0; // chars of src already decoded
    int produced = 0; // code points already stored in dest
    while (consumed < srcLen)
    {
        int codePoint = CodePointAt(src, srcOff + consumed, srcOff + srcLen);
        int width = Character.CharCount(codePoint);
        dest[destOff + produced] = codePoint;
        produced++;
        consumed += width;
    }

    return produced;
}
| |
/// <summary>
/// Encodes <paramref name="srcLen"/> Unicode code points of <paramref name="src"/>, starting at
/// <paramref name="srcOff"/>, as .NET chars stored in <paramref name="dest"/> beginning at
/// <paramref name="destOff"/>. </summary>
/// <returns> the number of chars written to the destination buffer </returns>
/// <exception cref="ArgumentException"> if <paramref name="srcLen"/> is negative. </exception>
public int ToChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff)
{
    if (srcLen < 0)
    {
        throw new ArgumentException("srcLen must be >= 0");
    }

    int charCount = 0;
    int index = 0;
    while (index < srcLen)
    {
        int codePoint = src[srcOff + index];
        // Character.ToChars reports how many chars it stored for this code point.
        int emitted = Character.ToChars(codePoint, dest, destOff + charCount);
        charCount += emitted;
        index++;
    }

    return charCount;
}
| |
| /// <summary> |
| /// Fills the <see cref="CharacterBuffer"/> with characters read from the given |
| /// <see cref="TextReader"/>. This method tries to read <paramref name="numChars"/> |
| /// characters into the <see cref="CharacterBuffer"/>; each call to fill will start |
| /// filling the buffer from offset <c>0</c> up to <paramref name="numChars"/>. |
| /// Because a code point can span two .NET chars, this method may |
| /// only fill <c>numChars - 1</c> characters in order not to split in |
| /// the middle of a surrogate pair, even if there are remaining characters in |
| /// the <see cref="TextReader"/>. |
| /// <para> |
| /// Depending on the <see cref="LuceneVersion"/> passed to |
| /// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method implements |
| /// supplementary character awareness when filling the given buffer. For all |
| /// <see cref="LuceneVersion"/> &gt; 3.0 <see cref="Fill(CharacterBuffer, TextReader, int)"/> guarantees |
| /// that the given <see cref="CharacterBuffer"/> will never contain a high surrogate |
| /// character as the last element in the buffer unless it is the last available |
| /// character in the reader. In other words, high and low surrogate pairs will |
| /// always be preserved across buffer borders. |
| /// </para> |
| /// <para> |
| /// A return value of <c>false</c> means that this method call exhausted |
| /// the reader, but there may be some chars which have been read, which can be |
| /// verified by checking whether <c>buffer.Length &gt; 0</c>. |
| /// </para> |
| /// </summary> |
| /// <param name="buffer"> |
| /// the buffer to fill. </param> |
| /// <param name="reader"> |
| /// the reader to read characters from. </param> |
| /// <param name="numChars"> |
| /// the number of chars to read </param> |
| /// <returns> <c>false</c> if and only if the reader ran out of characters while trying to fill the buffer </returns> |
| /// <exception cref="IOException"> |
| /// if the reader throws an <see cref="IOException"/>. </exception> |
| public abstract bool Fill(CharacterBuffer buffer, TextReader reader, int numChars); |
| |
/// <summary>
/// Fills <paramref name="buffer"/> from <paramref name="reader"/>, requesting as many chars as the
/// internal array can hold. Equivalent to <c>Fill(buffer, reader, buffer.Buffer.Length)</c>. </summary>
public virtual bool Fill(CharacterBuffer buffer, TextReader reader)
    => Fill(buffer, reader, buffer.Buffer.Length);
| |
| /// <summary> |
| /// Returns the index within <c>buf[start:start+count]</c> that is <paramref name="offset"/> |
| /// code points away from <paramref name="index"/>. |
| /// </summary> |
| public abstract int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset); |
| |
// Reads from the reader into dest[offset..] until len chars have been stored or a read
// delivers no chars (result <= 0). Returns the number of chars actually stored.
private static int ReadFully(TextReader reader, char[] dest, int offset, int len)
{
    int total = 0;
    for (int remaining = len; remaining > 0; remaining = len - total)
    {
        int count = reader.Read(dest, offset + total, remaining);
        if (count <= 0)
        {
            return total;
        }
        total += count;
    }
    return total;
}
| |
// Implementation returned by GetInstance for LUCENE_31 and later. Code point operations
// are delegated to J2N's Character class; Fill keeps surrogate pairs together.
private sealed class Java5CharacterUtils : CharacterUtils
{
    public override int CodePointAt(string seq, int offset)
        => Character.CodePointAt(seq, offset);

    public override int CodePointAt(ICharSequence seq, int offset)
        => Character.CodePointAt(seq, offset);

    public override int CodePointAt(char[] chars, int offset, int limit)
        => Character.CodePointAt(chars, offset, limit);

    public override bool Fill(CharacterBuffer buffer, TextReader reader, int numChars)
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(buffer.Buffer.Length >= 2);
        }
        if (numChars < 2 || numChars > buffer.Buffer.Length)
        {
            throw new ArgumentException("numChars must be >= 2 and <= the buffer size");
        }

        char[] chars = buffer.Buffer;
        buffer.offset = 0;

        // A high surrogate held back by the previous call becomes the first char of this fill.
        int start = 0;
        if (buffer.lastTrailingHighSurrogate != 0)
        {
            chars[0] = buffer.lastTrailingHighSurrogate;
            buffer.lastTrailingHighSurrogate = (char)0;
            start = 1;
        }

        int read = ReadFully(reader, chars, start, numChars - start);
        buffer.length = start + read;
        bool filled = buffer.length == numChars;

        // When the buffer could not be filled there is nothing to do about a trailing high
        // surrogate. Otherwise hold it back so that the pair is not split across two fills.
        if (buffer.length >= numChars && char.IsHighSurrogate(chars[buffer.length - 1]))
        {
            buffer.lastTrailingHighSurrogate = chars[--buffer.length];
        }

        return filled;
    }

    public override int CodePointCount(string seq)
    {
        if (seq is null)
        {
            throw new ArgumentNullException(nameof(seq));
        }
        return Character.CodePointCount(seq, 0, seq.Length);
    }

    public override int CodePointCount(ICharSequence seq)
    {
        if (seq is null)
        {
            throw new ArgumentNullException(nameof(seq));
        }
        return Character.CodePointCount(seq, 0, seq.Length);
    }

    public override int CodePointCount(char[] seq)
    {
        if (seq is null)
        {
            throw new ArgumentNullException(nameof(seq));
        }
        return Character.CodePointCount(seq, 0, seq.Length);
    }

    public override int CodePointCount(StringBuilder seq)
    {
        if (seq is null)
        {
            throw new ArgumentNullException(nameof(seq));
        }
        return Character.CodePointCount(seq, 0, seq.Length);
    }

    public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
        => Character.OffsetByCodePoints(buf, start, count, index, offset);
}
| |
// Java 1.4 style implementation: every UTF-16 char is treated as one code point, so
// surrogate pairs are never combined.
// LUCENENET specific - not sealed so we can make another override to handle BW compatibility
// with broken unicode support (Lucene 3.0). See the TestCharArraySet.TestSupplementaryCharsBWCompat()
// and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests.
private class Java4CharacterUtils : CharacterUtils
{
    // Returns the char at offset, widened to int.
    public override int CodePointAt(string seq, int offset)
    {
        // LUCENENET specific - added guard clauses
        if (seq is null)
            throw new ArgumentNullException(nameof(seq));
        if (offset < 0 || offset >= seq.Length)
            throw new ArgumentOutOfRangeException(nameof(offset));

        return seq[offset];
    }

    // Returns the char at offset, widened to int.
    public override int CodePointAt(ICharSequence seq, int offset)
    {
        // LUCENENET specific - added guard clauses
        if (seq is null)
            throw new ArgumentNullException(nameof(seq));
        if (offset < 0 || offset >= seq.Length)
            throw new ArgumentOutOfRangeException(nameof(offset));

        return seq[offset];
    }

    // Returns the char at offset, widened to int; offset must be below both limit and chars.Length.
    public override int CodePointAt(char[] chars, int offset, int limit)
    {
        if (chars is null)
            throw new ArgumentNullException(nameof(chars)); // LUCENENET specific - added for .NET compatibility
        if (offset >= limit)
        {
            // The single-string constructor takes the parameter name, not the message,
            // so both are passed explicitly.
            throw new ArgumentOutOfRangeException(nameof(offset), "offset must be less than limit");
        }
        // LUCENENET specific - added array bound check
        if (offset < 0 || offset >= chars.Length)
            throw new ArgumentOutOfRangeException(nameof(offset));

        return chars[offset];
    }

    // Reads up to numChars chars into the start of the buffer without any surrogate handling.
    // Returns true only when exactly numChars chars were read.
    public override bool Fill(CharacterBuffer buffer, TextReader reader, int numChars)
    {
        if (Debugging.AssertsEnabled) Debugging.Assert(buffer.Buffer.Length >= 1);
        if (numChars < 1 || numChars > buffer.Buffer.Length)
        {
            throw new ArgumentException("numChars must be >= 1 and <= the buffer size");
        }
        buffer.offset = 0;
        int read = ReadFully(reader, buffer.Buffer, 0, numChars);
        buffer.length = read;
        buffer.lastTrailingHighSurrogate = (char)0;
        return read == numChars;
    }

    // The CodePointCount overloads return the number of chars, since each char counts as one code point here.
    public override int CodePointCount(string seq)
    {
        if (seq is null)
            throw new ArgumentNullException(nameof(seq));

        return seq.Length;
    }

    public override int CodePointCount(ICharSequence seq)
    {
        if (seq is null)
            throw new ArgumentNullException(nameof(seq));

        return seq.Length;
    }

    public override int CodePointCount(char[] seq)
    {
        if (seq is null)
            throw new ArgumentNullException(nameof(seq));

        return seq.Length;
    }

    public override int CodePointCount(StringBuilder seq)
    {
        if (seq is null)
            throw new ArgumentNullException(nameof(seq));

        return seq.Length;
    }

    // Moving by code points equals moving by chars: returns index + offset.
    // NOTE(review): the range check compares against count only and does not use start - confirm
    // against callers before changing it.
    public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
    {
        int result = index + offset;
        if (result < 0 || result > count)
        {
            throw new ArgumentOutOfRangeException();
        }
        return result;
    }
}
| |
// LUCENENET specific class to handle BW compatibility
// with broken unicode support (Lucene 3.0). See the TestCharArraySet.TestSupplementaryCharsBWCompat()
// and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests. This just provides the old (slower)
// implementation that represents the original Lucene toUpperCase and toLowerCase methods.
private class Java4CharacterUtilsBWCompatibility : Java4CharacterUtils
{
    // Lowercases length chars starting at offset, one (Java 4 style) code point at a time.
    // The third argument is a length, exactly as in the base class contract; earlier it was
    // interpreted as an exclusive end index, which only matched the base class for offset == 0.
    public override void ToLower(char[] buffer, int offset, int length)
    {
        int limit = offset + length; // exclusive end of the range to convert
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(buffer.Length >= limit);
            // The former check (offset <= 0) was inverted and rejected every positive offset.
            Debugging.Assert(offset >= 0 && offset <= buffer.Length);
        }

        for (int i = offset; i < limit;)
        {
            // Character.ToChars writes the converted code point back at i and returns
            // the number of chars written, which advances the loop.
            i += Character.ToChars(
                Character.ToLower(
                    CodePointAt(buffer, i, limit), CultureInfo.InvariantCulture), buffer, i);
        }
    }

    // Uppercases length chars starting at offset, one (Java 4 style) code point at a time.
    // The third argument is a length, exactly as in the base class contract; earlier it was
    // interpreted as an exclusive end index, which only matched the base class for offset == 0.
    public override void ToUpper(char[] buffer, int offset, int length)
    {
        int limit = offset + length; // exclusive end of the range to convert
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(buffer.Length >= limit);
            // The former check (offset <= 0) was inverted and rejected every positive offset.
            Debugging.Assert(offset >= 0 && offset <= buffer.Length);
        }

        for (int i = offset; i < limit;)
        {
            // Character.ToChars writes the converted code point back at i and returns
            // the number of chars written, which advances the loop.
            i += Character.ToChars(
                Character.ToUpper(
                    CodePointAt(buffer, i, limit), CultureInfo.InvariantCulture), buffer, i);
        }
    }
}
| |
/// <summary>
/// A simple IO buffer to use with
/// <see cref="CharacterUtils.Fill(CharacterBuffer, TextReader)"/>. It holds the char array,
/// the offset and length of the valid data, and a high surrogate held back by the last fill.
/// </summary>
public sealed class CharacterBuffer
{
    private readonly char[] buffer;
    internal int offset;
    internal int length;
    // Internal rather than private so the CharacterUtils implementations in the enclosing
    // class can read and write it directly.
    internal char lastTrailingHighSurrogate;

    internal CharacterBuffer(char[] buffer, int offset, int length)
    {
        this.buffer = buffer;
        this.offset = offset;
        this.length = length;
    }

    /// <summary>
    /// Gets the internal buffer. The array itself is returned, not a copy.
    /// </summary>
    [WritableArray]
    [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
    public char[] Buffer
    {
        get { return buffer; }
    }

    /// <summary>
    /// Gets the data offset in the internal buffer.
    /// </summary>
    public int Offset
    {
        get { return offset; }
    }

    /// <summary>
    /// Gets the length of the data in the internal buffer starting at
    /// <see cref="Offset"/>.
    /// </summary>
    public int Length
    {
        get { return length; }
    }

    /// <summary>
    /// Resets the CharacterBuffer: offset, length and the held-back high surrogate
    /// go back to their default values. The contents of the char array are not touched.
    /// </summary>
    public void Reset()
    {
        lastTrailingHighSurrogate = '\0';
        length = 0;
        offset = 0;
    }
}
| } |
| } |