src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs - lucenenet - Git at Google

 using Lucene.Net.Support;
 using Lucene.Net.Util;
 using System;
 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
 using System.IO;

 namespace Lucene.Net.Analysis.Util
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// <see cref="CharacterUtils"/> provides a unified interface to Character-related
     /// operations to implement backwards compatible character operations based on a
     /// <see cref="LuceneVersion"/> instance.
     ///
     /// @lucene.internal
     /// </summary>
     public abstract class CharacterUtils
     {
         // LUCENENET specific: shared instance of the class supporting the broken Unicode
         // handling of Lucene 3.0. It is the instance returned for match versions before LUCENE_31.
         // See the TestCharArraySet.TestSupplementaryCharsBWCompat()
         // and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests.
         private static readonly CharacterUtils JAVA_4_BW_COMPAT = new Java4CharacterUtilsBWCompatibility();

         // Shared instance that treats every single char as one code point (Java 1.4 behavior).
         private static readonly CharacterUtils JAVA_4 = new Java4CharacterUtils();
         // Shared instance that delegates code point operations to the Character helper (Java 5 behavior).
         private static readonly CharacterUtils JAVA_5 = new Java5CharacterUtils();

         /// <summary>
         /// Selects the shared <see cref="CharacterUtils"/> implementation that matches the
         /// supplied <see cref="LuceneVersion"/>.
         /// </summary>
         /// <param name="matchVersion">
         ///          the version whose character handling should be reproduced </param>
         /// <returns> the Java 5 style implementation when <paramref name="matchVersion"/> is
         ///         <c>LUCENE_31</c> or later, otherwise the Lucene 3.0 backward compatible
         ///         implementation. </returns>
         public static CharacterUtils GetInstance(LuceneVersion matchVersion)
         {
 #pragma warning disable 612, 618
             bool isLucene31OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
 #pragma warning restore 612, 618
             if (isLucene31OrLater)
             {
                 return JAVA_5;
             }
             return JAVA_4_BW_COMPAT;
         }

         /// <summary>
         /// Returns a <see cref="CharacterUtils"/> instance with Java 1.4 semantics (one char is
         /// one code point).
         /// </summary>
         /// <param name="matchVersion">
         ///          decides which Java 1.4 flavor is returned: the regular one for
         ///          <c>LUCENE_31</c> or later, the Lucene 3.0 backward compatible one otherwise </param>
         /// <returns> the selected shared instance </returns>
         public static CharacterUtils GetJava4Instance(LuceneVersion matchVersion) // LUCENENET specific - added matchVersion parameter so we can support backward compatible Unicode support
         {
 #pragma warning disable 612, 618
             if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
             {
                 return JAVA_4;
             }
 #pragma warning restore 612, 618
             return JAVA_4_BW_COMPAT;
         }

         /// <summary>
         /// Returns the code point at the given index of the <see cref="string"/>.
         /// Depending on the <see cref="LuceneVersion"/> passed to
         /// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method mimics the behavior
         /// of <c>Character.CodePointAt(char[], int)</c> as it would have been
         /// available on a Java 1.4 JVM or on a later virtual machine version.
         /// </summary>
         /// <param name="seq">
         ///          a character sequence </param>
         /// <param name="offset">
         ///          the index of the char in <paramref name="seq"/> to be converted
         /// </param>
         /// <returns> the Unicode code point at the given index </returns>
         /// <exception cref="NullReferenceException">
         ///           - if the sequence is null. </exception>
         /// <exception cref="IndexOutOfRangeException">
         ///           - if the value offset is negative or not less than the length of
         ///           the character sequence. </exception>
         public abstract int CodePointAt(string seq, int offset);

         /// <summary>
         /// Returns the code point at the given index of the <see cref="ICharSequence"/>.
         /// Same contract as the <see cref="string"/> overload, for an
         /// <see cref="ICharSequence"/> source.
         /// </summary>
         /// <param name="seq">
         ///          a character sequence </param>
         /// <param name="offset">
         ///          the index of the char in <paramref name="seq"/> to be converted
         /// </param>
         /// <returns> the Unicode code point at the given index </returns>
         public abstract int CodePointAt(ICharSequence seq, int offset);

         /// <summary>
         /// Returns the code point at the given index of the char array where only elements
         /// with index less than the limit are used.
         /// Depending on the <see cref="LuceneVersion"/> passed to
         /// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method mimics the behavior
         /// of <c>Character.CodePointAt(char[], int)</c> as it would have been
         /// available on a Java 1.4 JVM or on a later virtual machine version.
         /// </summary>
         /// <param name="chars">
         ///          a character array </param>
         /// <param name="offset">
         ///          the offset to the char values in the chars array to be converted </param>
         /// <param name="limit"> the index after the last element that should be used to calculate
         ///        the code point.
         /// </param>
         /// <returns> the Unicode code point at the given index </returns>
         /// <exception cref="NullReferenceException">
         ///           - if the array is null. </exception>
         /// <exception cref="IndexOutOfRangeException">
         ///           - if the value offset is negative or not less than the length of
         ///           the char array. </exception>
         public abstract int CodePointAt(char[] chars, int offset, int limit);

         /// <summary>
         /// Returns the number of code points in <paramref name="seq"/>, counted according to the
         /// semantics of the concrete implementation. </summary>
         /// <param name="seq"> the string whose code points are counted </param>
         /// <returns> the code point count of <paramref name="seq"/> </returns>
         public abstract int CodePointCount(string seq);

         /// <summary>
         /// Allocates a <see cref="T:char[]"/> of <paramref name="bufferSize"/> elements and wraps it
         /// in a new, empty <see cref="CharacterBuffer"/> (offset and length both <c>0</c>).
         /// </summary>
         /// <param name="bufferSize">
         ///          the internal char buffer size, must be <c>&gt;= 2</c> </param>
         /// <returns> a new <see cref="CharacterBuffer"/> instance. </returns>
         /// <exception cref="ArgumentException">
         ///           if <paramref name="bufferSize"/> is less than 2. </exception>
         public static CharacterBuffer NewCharacterBuffer(int bufferSize)
         {
             if (bufferSize >= 2)
             {
                 char[] storage = new char[bufferSize];
                 return new CharacterBuffer(storage, 0, 0);
             }

             throw new System.ArgumentException("buffersize must be >= 2");
         }


         /// <summary>
         /// Lowercases, in place, the chars of <paramref name="buffer"/> from index
         /// <paramref name="offset"/> (inclusive) up to <paramref name="limit"/> (exclusive)
         /// using the invariant culture. </summary>
         /// <param name="buffer"> the char buffer to lowercase </param>
         /// <param name="offset"> the index of the first char to lowercase </param>
         /// <param name="limit"> the index after the last char to lowercase; this is an end index,
         /// not a count. An empty or inverted range leaves the buffer untouched. </param>
         public virtual void ToLower(char[] buffer, int offset, int limit) // LUCENENET specific - marked virtual so we can override the default
         {
             Debug.Assert(buffer.Length >= limit);
             Debug.Assert(offset >= 0 && offset <= buffer.Length);

             // limit is an exclusive end index, while the string constructor and CopyTo expect a
             // char count, so the count has to be derived from the range.
             int count = limit - offset;
             if (count <= 0)
             {
                 return;
             }

             // Optimization provided by Vincent Van Den Berghe:
             // http://search-lucene.com/m/Lucene.Net/j1zMf1uckOzOYqsi?subj=Proposal+to+speed+up+implementation+of+LowercaseFilter+charUtils+ToLower
             // One bulk conversion replaces a per-code-point loop over the range.
             // NOTE(review): relies on invariant casing returning a string of the same length - confirm.
             new string(buffer, offset, count)
                 .ToLowerInvariant()
                 .CopyTo(0, buffer, offset, count);
         }

         /// <summary>
         /// Uppercases, in place, the chars of <paramref name="buffer"/> from index
         /// <paramref name="offset"/> (inclusive) up to <paramref name="limit"/> (exclusive)
         /// using the invariant culture. </summary>
         /// <param name="buffer"> the char buffer to UPPERCASE </param>
         /// <param name="offset"> the index of the first char to uppercase </param>
         /// <param name="limit"> the index after the last char to uppercase; this is an end index,
         /// not a count. An empty or inverted range leaves the buffer untouched. </param>
         public virtual void ToUpper(char[] buffer, int offset, int limit) // LUCENENET specific - marked virtual so we can override the default
         {
             Debug.Assert(buffer.Length >= limit);
             Debug.Assert(offset >= 0 && offset <= buffer.Length);

             // limit is an exclusive end index, while the string constructor and CopyTo expect a
             // char count, so the count has to be derived from the range.
             int count = limit - offset;
             if (count <= 0)
             {
                 return;
             }

             // Optimization provided by Vincent Van Den Berghe:
             // http://search-lucene.com/m/Lucene.Net/j1zMf1uckOzOYqsi?subj=Proposal+to+speed+up+implementation+of+LowercaseFilter+charUtils+ToLower
             // One bulk conversion replaces a per-code-point loop over the range.
             // NOTE(review): relies on invariant casing returning a string of the same length - confirm.
             new string(buffer, offset, count)
                 .ToUpperInvariant()
                 .CopyTo(0, buffer, offset, count);
         }

         /// <summary>
         /// Decodes <paramref name="srcLen"/> chars of <paramref name="src"/>, starting at
         /// <paramref name="srcOff"/>, into code points stored in <paramref name="dest"/> from
         /// index <paramref name="destOff"/> onwards. Decoding uses this instance's
         /// <see cref="CodePointAt(char[], int, int)"/>. </summary>
         /// <returns> the number of code points written to the destination buffer </returns>
         /// <exception cref="ArgumentException"> if <paramref name="srcLen"/> is negative. </exception>
         public int ToCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff)
         {
             if (srcLen < 0)
             {
                 throw new System.ArgumentException("srcLen must be >= 0");
             }

             int srcLimit = srcOff + srcLen;
             int produced = 0;
             int consumed = 0;
             while (consumed < srcLen)
             {
                 int codePoint = CodePointAt(src, srcOff + consumed, srcLimit);
                 // a code point may occupy more than one char in the source
                 int width = Character.CharCount(codePoint);
                 dest[destOff + produced] = codePoint;
                 produced++;
                 consumed += width;
             }
             return produced;
         }

         /// <summary>
         /// Encodes <paramref name="srcLen"/> code points of <paramref name="src"/>, starting at
         /// <paramref name="srcOff"/>, as .NET chars written to <paramref name="dest"/> from index
         /// <paramref name="destOff"/> onwards. </summary>
         /// <returns> the number of chars written to the destination buffer </returns>
         /// <exception cref="ArgumentException"> if <paramref name="srcLen"/> is negative. </exception>
         public int ToChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff)
         {
             if (srcLen < 0)
             {
                 throw new System.ArgumentException("srcLen must be >= 0");
             }

             int total = 0;
             int readIndex = srcOff;
             for (int remaining = srcLen; remaining > 0; remaining--)
             {
                 int codePoint = src[readIndex];
                 readIndex++;
                 // Character.ToChars reports how many chars it wrote for this code point
                 total += Character.ToChars(codePoint, dest, destOff + total);
             }
             return total;
         }

         /// <summary>
         /// Fills the <see cref="CharacterBuffer"/> with characters read from the given
         /// <see cref="TextReader"/>. This method tries to read <paramref name="numChars"/>
         /// characters into the <see cref="CharacterBuffer"/>; each call to fill will start
         /// filling the buffer from offset <c>0</c> up to <paramref name="numChars"/>.
         /// In case code points can span across 2 chars, this method may
         /// only fill <c>numChars - 1</c> characters in order not to split in
         /// the middle of a surrogate pair, even if there are remaining characters in
         /// the <see cref="TextReader"/>.
         /// <para>
         /// Depending on the <see cref="LuceneVersion"/> passed to
         /// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method implements
         /// supplementary character awareness when filling the given buffer. For all
         /// <see cref="LuceneVersion"/> &gt; 3.0 <see cref="Fill(CharacterBuffer, TextReader, int)"/> guarantees
         /// that the given <see cref="CharacterBuffer"/> will never contain a high surrogate
         /// character as the last element in the buffer unless it is the last available
         /// character in the reader. In other words, high and low surrogate pairs will
         /// always be preserved across buffer borders.
         /// </para>
         /// <para>
         /// A return value of <c>false</c> means that this method call exhausted
         /// the reader, but there may be some chars which have been read, which can be
         /// verified by checking whether <c>buffer.Length &gt; 0</c>.
         /// </para>
         /// </summary>
         /// <param name="buffer">
         ///          the buffer to fill. </param>
         /// <param name="reader">
         ///          the reader to read characters from. </param>
         /// <param name="numChars">
         ///          the number of chars to read </param>
         /// <returns> <c>false</c> if and only if the reader reported no more data (a read returned
         ///          0 or less) while trying to fill the buffer </returns>
         /// <exception cref="IOException">
         ///           if the reader throws an <see cref="IOException"/>. </exception>
         public abstract bool Fill(CharacterBuffer buffer, TextReader reader, int numChars);

         /// <summary>
         /// Convenience overload that requests as many chars as the internal array of
         /// <paramref name="buffer"/> can hold, i.e. calls
         /// <c>Fill(buffer, reader, buffer.Buffer.Length)</c>. </summary>
         /// <param name="buffer"> the buffer to fill </param>
         /// <param name="reader"> the reader to read characters from </param>
         /// <returns> the result of the three-argument overload </returns>
         public virtual bool Fill(CharacterBuffer buffer, TextReader reader)
         {
             int capacity = buffer.Buffer.Length;
             bool filled = Fill(buffer, reader, capacity);
             return filled;
         }

         /// <summary>
         /// Return the index within <c>buf[start:start+count]</c> which is offset by <paramref name="offset"/>
         /// code points from <paramref name="index"/>.
         /// </summary>
         /// <param name="buf"> the char array to navigate </param>
         /// <param name="start"> the index of the first char of the sub-range </param>
         /// <param name="count"> the length of the sub-range in chars </param>
         /// <param name="index"> the index to start counting from </param>
         /// <param name="offset"> the number of code points to move by </param>
         /// <returns> the resulting index </returns>
         public abstract int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset);

         /// <summary>
         /// Keeps reading from <paramref name="reader"/> into <paramref name="dest"/> (starting at
         /// <paramref name="offset"/>) until <paramref name="len"/> chars have been stored or a
         /// read yields no chars.
         /// </summary>
         /// <returns> the number of chars actually stored, which may be less than <paramref name="len"/> </returns>
         private static int ReadFully(TextReader reader, char[] dest, int offset, int len)
         {
             int total = 0;
             int remaining = len;
             while (remaining > 0)
             {
                 int got = reader.Read(dest, offset + total, remaining);
                 if (got < 1)
                 {
                     // the reader has nothing more to give
                     return total;
                 }
                 total += got;
                 remaining -= got;
             }
             return total;
         }

         /// <summary>
         /// Implementation that forwards code point operations to the <see cref="Character"/>
         /// helper and never leaves a high surrogate as the last char of a completely filled buffer.
         /// </summary>
         private sealed class Java5CharacterUtils : CharacterUtils
         {
             public override int CodePointAt(string seq, int offset)
             {
                 int codePoint = Character.CodePointAt(seq, offset);
                 return codePoint;
             }

             public override int CodePointAt(ICharSequence seq, int offset)
             {
                 int codePoint = Character.CodePointAt(seq, offset);
                 return codePoint;
             }

             public override int CodePointAt(char[] chars, int offset, int limit)
             {
                 // LUCENENET TODO: This will throw a NullReferenceException if chars is null. Should this be an ArgumentNullException in .NET?
                 int codePoint = Character.CodePointAt(chars, offset, limit);
                 return codePoint;
             }

             /// <summary>
             /// Fills <paramref name="buffer"/> with up to <paramref name="numChars"/> chars. A high
             /// surrogate that ends up last in a completely filled buffer is held back and
             /// re-inserted at index 0 by the next call.
             /// </summary>
             public override bool Fill(CharacterBuffer buffer, TextReader reader, int numChars)
             {
                 Debug.Assert(buffer.Buffer.Length >= 2);
                 if (!(numChars >= 2 && numChars <= buffer.Buffer.Length))
                 {
                     throw new System.ArgumentException("numChars must be >= 2 and <= the buffer size");
                 }

                 char[] target = buffer.Buffer;
                 buffer.offset = 0;

                 // Re-insert the high surrogate held back by the previous call, if there is one.
                 int carried = 0;
                 char pending = buffer.lastTrailingHighSurrogate;
                 if (pending != 0)
                 {
                     target[0] = pending;
                     buffer.lastTrailingHighSurrogate = (char)0;
                     carried = 1;
                 }

                 int freshlyRead = ReadFully(reader, target, carried, numChars - carried);
                 buffer.length = carried + freshlyRead;

                 bool filledCompletely = buffer.length == numChars;

                 // When the buffer could not be filled there is nothing we can do about a trailing
                 // high surrogate; otherwise hold it back so the pair is not split.
                 if (buffer.length >= numChars && char.IsHighSurrogate(target[buffer.length - 1]))
                 {
                     buffer.length--;
                     buffer.lastTrailingHighSurrogate = target[buffer.length];
                 }
                 return filledCompletely;
             }

             public override int CodePointCount(string seq)
             {
                 int endIndex = seq.Length;
                 return Character.CodePointCount(seq, 0, endIndex);
             }

             public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
             {
                 int target = Character.OffsetByCodePoints(buf, start, count, index, offset);
                 return target;
             }
         }

         // LUCENENET specific - left unsealed so that a further subclass can provide BW compatibility
         // with broken unicode support (Lucene 3.0). See the TestCharArraySet.TestSupplementaryCharsBWCompat()
         // and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests.
         /// <summary>
         /// Implementation in which every char counts as exactly one code point.
         /// </summary>
         private class Java4CharacterUtils : CharacterUtils
         {
             public override int CodePointAt(string seq, int offset)
             {
                 char unit = seq[offset];
                 return unit;
             }

             public override int CodePointAt(ICharSequence seq, int offset)
             {
                 return seq[offset];
             }

             public override int CodePointAt(char[] chars, int offset, int limit)
             {
                 if (offset < limit)
                 {
                     // LUCENENET TODO: This will throw a NullReferenceException if chars is null. Should this be an ArgumentNullException in .NET?
                     return chars[offset];
                 }

                 throw new System.IndexOutOfRangeException("offset must be less than limit");
             }

             /// <summary>
             /// Reads up to <paramref name="numChars"/> chars into <paramref name="buffer"/> without
             /// any surrogate handling.
             /// </summary>
             public override bool Fill(CharacterBuffer buffer, TextReader reader, int numChars)
             {
                 Debug.Assert(buffer.Buffer.Length >= 1);
                 if (!(numChars >= 1 && numChars <= buffer.Buffer.Length))
                 {
                     throw new System.ArgumentException("numChars must be >= 1 and <= the buffer size");
                 }

                 buffer.offset = 0;
                 int count = ReadFully(reader, buffer.Buffer, 0, numChars);
                 buffer.lastTrailingHighSurrogate = (char)0;
                 buffer.length = count;
                 return count == numChars;
             }

             public override int CodePointCount(string seq)
             {
                 // one char == one code point for this implementation
                 int chars = seq.Length;
                 return chars;
             }

             public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
             {
                 int target = index + offset;
                 if (target >= 0 && target <= count)
                 {
                     return target;
                 }

                 throw new System.IndexOutOfRangeException();
             }
         }

         // LUCENENET specific class to handle BW compatibility
         // with broken unicode support (Lucene 3.0). See the TestCharArraySet.TestSupplementaryCharsBWCompat()
         // and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests. This just provides the old (slower)
         // implementation that represents the original Lucene toUpperCase and toLowerCase methods.
         private class Java4CharacterUtilsBWCompatibility : Java4CharacterUtils
         {
             /// <summary>
             /// Lowercases the range <c>[offset, limit)</c> of <paramref name="buffer"/> in place,
             /// one code point at a time as decoded by <see cref="CharacterUtils.CodePointAt(char[], int, int)"/>.
             /// </summary>
             /// <param name="buffer"> the char buffer to lowercase </param>
             /// <param name="offset"> the index of the first char to convert </param>
             /// <param name="limit"> the index after the last char to convert </param>
             public override void ToLower(char[] buffer, int offset, int limit)
             {
                 Debug.Assert(buffer.Length >= limit);
                 // offset must be a valid, non-negative start index
                 Debug.Assert(offset >= 0 && offset <= buffer.Length);

                 for (int i = offset; i < limit;)
                 {
                     // Character.ToChars writes the converted code point back at i and returns
                     // the number of chars written, which is how far to advance.
                     i += Character.ToChars(
                         Character.ToLower(
                             CodePointAt(buffer, i, limit)), buffer, i);
                 }
             }

             /// <summary>
             /// Uppercases the range <c>[offset, limit)</c> of <paramref name="buffer"/> in place,
             /// one code point at a time as decoded by <see cref="CharacterUtils.CodePointAt(char[], int, int)"/>.
             /// </summary>
             /// <param name="buffer"> the char buffer to uppercase </param>
             /// <param name="offset"> the index of the first char to convert </param>
             /// <param name="limit"> the index after the last char to convert </param>
             public override void ToUpper(char[] buffer, int offset, int limit)
             {
                 Debug.Assert(buffer.Length >= limit);
                 // offset must be a valid, non-negative start index
                 Debug.Assert(offset >= 0 && offset <= buffer.Length);

                 for (int i = offset; i < limit;)
                 {
                     // Character.ToChars writes the converted code point back at i and returns
                     // the number of chars written, which is how far to advance.
                     i += Character.ToChars(
                         Character.ToUpper(
                             CodePointAt(buffer, i, limit)), buffer, i);
                 }
             }
         }

         /// <summary>
         /// Plain IO buffer passed to
         /// <see cref="CharacterUtils.Fill(CharacterBuffer, TextReader)"/>: a char array together
         /// with the offset and length of the data currently held in it.
         /// </summary>
         public sealed class CharacterBuffer
         {
             private readonly char[] buffer;

             // internal rather than private: the enclosing class's Fill implementations
             // read and write these fields directly.
             internal int offset;
             internal int length;
             internal char lastTrailingHighSurrogate;

             internal CharacterBuffer(char[] buffer, int offset, int length)
             {
                 this.length = length;
                 this.offset = offset;
                 this.buffer = buffer;
             }

             /// <summary>
             /// Gets the internal char array itself (not a copy).
             /// </summary>
             /// <returns> the buffer </returns>
             [WritableArray]
             [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
             public char[] Buffer
             {
                 get { return this.buffer; }
             }

             /// <summary>
             /// Gets the index in the internal buffer at which the data starts.
             /// </summary>
             /// <returns> the offset </returns>
             public int Offset
             {
                 get { return this.offset; }
             }

             /// <summary>
             /// Gets the number of chars of data in the internal buffer, counted from
             /// <see cref="Offset"/>
             /// </summary>
             /// <returns> the length </returns>
             public int Length
             {
                 get { return this.length; }
             }

             /// <summary>
             /// Puts the CharacterBuffer back into its initial state: offset and length are set
             /// to <c>0</c> and any held-back high surrogate is discarded.
             /// </summary>
             public void Reset()
             {
                 lastTrailingHighSurrogate = '\0';
                 length = 0;
                 offset = 0;
             }
         }
     }
 }
	using Lucene.Net.Support;
	using Lucene.Net.Util;
	using System;
	using System.Diagnostics;
	using System.Diagnostics.CodeAnalysis;
	using System.IO;

	namespace Lucene.Net.Analysis.Util
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// <see cref="CharacterUtils"/> provides a unified interface to Character-related
	/// operations to implement backwards compatible character operations based on a
	/// <see cref="LuceneVersion"/> instance.
	///
	/// @lucene.internal
	/// </summary>
	public abstract class CharacterUtils
	{
	// LUCENENET specific: shared instance of the class supporting the broken Unicode
	// handling of Lucene 3.0. It is the instance returned for match versions before LUCENE_31.
	// See the TestCharArraySet.TestSupplementaryCharsBWCompat()
	// and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests.
	private static readonly CharacterUtils JAVA_4_BW_COMPAT = new Java4CharacterUtilsBWCompatibility();

	// Shared instance returned by GetJava4Instance for LUCENE_31 or later.
	private static readonly CharacterUtils JAVA_4 = new Java4CharacterUtils();
	// Shared instance returned by GetInstance for LUCENE_31 or later.
	private static readonly CharacterUtils JAVA_5 = new Java5CharacterUtils();

	/// <summary>
	/// Selects the shared <see cref="CharacterUtils"/> implementation that matches the
	/// supplied <see cref="LuceneVersion"/>.
	/// </summary>
	/// <param name="matchVersion">
	/// the version whose character handling should be reproduced </param>
	/// <returns> the Java 5 style implementation when <paramref name="matchVersion"/> is
	/// <c>LUCENE_31</c> or later, otherwise the Lucene 3.0 backward compatible
	/// implementation. </returns>
	public static CharacterUtils GetInstance(LuceneVersion matchVersion)
	{
	#pragma warning disable 612, 618
		bool isLucene31OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
	#pragma warning restore 612, 618
		if (isLucene31OrLater)
		{
			return JAVA_5;
		}
		return JAVA_4_BW_COMPAT;
	}

	/// <summary>
	/// Returns a <see cref="CharacterUtils"/> instance compatible with Java 1.4.
	/// </summary>
	/// <param name="matchVersion">
	/// decides which Java 1.4 flavor is returned: the regular one for
	/// <c>LUCENE_31</c> or later, the Lucene 3.0 backward compatible one otherwise </param>
	/// <returns> the selected shared instance </returns>
	public static CharacterUtils GetJava4Instance(LuceneVersion matchVersion) // LUCENENET specific - added matchVersion parameter so we can support backward compatible Unicode support
	{
	#pragma warning disable 612, 618
		if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
		{
			return JAVA_4;
		}
	#pragma warning restore 612, 618
		return JAVA_4_BW_COMPAT;
	}

	/// <summary>
	/// Returns the code point at the given index of the <see cref="string"/>.
	/// Depending on the <see cref="LuceneVersion"/> passed to
	/// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method mimics the behavior
	/// of <c>Character.CodePointAt(char[], int)</c> as it would have been
	/// available on a Java 1.4 JVM or on a later virtual machine version.
	/// </summary>
	/// <param name="seq">
	/// a character sequence </param>
	/// <param name="offset">
	/// the index of the char in <paramref name="seq"/> to be converted
	/// </param>
	/// <returns> the Unicode code point at the given index </returns>
	/// <exception cref="NullReferenceException">
	/// - if the sequence is null. </exception>
	/// <exception cref="IndexOutOfRangeException">
	/// - if the value offset is negative or not less than the length of
	/// the character sequence. </exception>
	public abstract int CodePointAt(string seq, int offset);

	/// <summary>
	/// Returns the code point at the given index of the <see cref="ICharSequence"/>.
	/// Same contract as the <see cref="string"/> overload, for an
	/// <see cref="ICharSequence"/> source.
	/// </summary>
	public abstract int CodePointAt(ICharSequence seq, int offset);

	/// <summary>
	/// Returns the code point at the given index of the char array where only elements
	/// with index less than the limit are used.
	/// Depending on the <see cref="LuceneVersion"/> passed to
	/// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method mimics the behavior
	/// of <c>Character.CodePointAt(char[], int)</c> as it would have been
	/// available on a Java 1.4 JVM or on a later virtual machine version.
	/// </summary>
	/// <param name="chars">
	/// a character array </param>
	/// <param name="offset">
	/// the offset to the char values in the chars array to be converted </param>
	/// <param name="limit"> the index after the last element that should be used to calculate
	/// the code point.
	/// </param>
	/// <returns> the Unicode code point at the given index </returns>
	/// <exception cref="NullReferenceException">
	/// - if the array is null. </exception>
	/// <exception cref="IndexOutOfRangeException">
	/// - if the value offset is negative or not less than the length of
	/// the char array. </exception>
	public abstract int CodePointAt(char[] chars, int offset, int limit);

	/// <summary>
	/// Returns the number of code points in <paramref name="seq"/>, counted according to the
	/// semantics of the concrete implementation. </summary>
	public abstract int CodePointCount(string seq);

	/// <summary>
	/// Allocates a <see cref="T:char[]"/> of <paramref name="bufferSize"/> elements and wraps it
	/// in a new, empty <see cref="CharacterBuffer"/> (offset and length both <c>0</c>).
	/// </summary>
	/// <param name="bufferSize">
	/// the internal char buffer size, must be <c>&gt;= 2</c> </param>
	/// <returns> a new <see cref="CharacterBuffer"/> instance. </returns>
	/// <exception cref="ArgumentException">
	/// if <paramref name="bufferSize"/> is less than 2. </exception>
	public static CharacterBuffer NewCharacterBuffer(int bufferSize)
	{
		if (bufferSize >= 2)
		{
			char[] storage = new char[bufferSize];
			return new CharacterBuffer(storage, 0, 0);
		}

		throw new System.ArgumentException("buffersize must be >= 2");
	}


	/// <summary>
	/// Lowercases, in place, the chars of <paramref name="buffer"/> from index
	/// <paramref name="offset"/> (inclusive) up to <paramref name="limit"/> (exclusive)
	/// using the invariant culture. </summary>
	/// <param name="buffer"> the char buffer to lowercase </param>
	/// <param name="offset"> the index of the first char to lowercase </param>
	/// <param name="limit"> the index after the last char to lowercase; this is an end index,
	/// not a count. An empty or inverted range leaves the buffer untouched. </param>
	public virtual void ToLower(char[] buffer, int offset, int limit) // LUCENENET specific - marked virtual so we can override the default
	{
		Debug.Assert(buffer.Length >= limit);
		Debug.Assert(offset >= 0 && offset <= buffer.Length);

		// limit is an exclusive end index, while the string constructor and CopyTo expect a
		// char count, so the count has to be derived from the range.
		int count = limit - offset;
		if (count <= 0)
		{
			return;
		}

		// Optimization provided by Vincent Van Den Berghe:
		// http://search-lucene.com/m/Lucene.Net/j1zMf1uckOzOYqsi?subj=Proposal+to+speed+up+implementation+of+LowercaseFilter+charUtils+ToLower
		// One bulk conversion replaces a per-code-point loop over the range.
		// NOTE(review): relies on invariant casing returning a string of the same length - confirm.
		new string(buffer, offset, count)
			.ToLowerInvariant()
			.CopyTo(0, buffer, offset, count);
	}

	/// <summary>
	/// Uppercases, in place, the chars of <paramref name="buffer"/> from index
	/// <paramref name="offset"/> (inclusive) up to <paramref name="limit"/> (exclusive)
	/// using the invariant culture. </summary>
	/// <param name="buffer"> the char buffer to UPPERCASE </param>
	/// <param name="offset"> the index of the first char to uppercase </param>
	/// <param name="limit"> the index after the last char to uppercase; this is an end index,
	/// not a count. An empty or inverted range leaves the buffer untouched. </param>
	public virtual void ToUpper(char[] buffer, int offset, int limit) // LUCENENET specific - marked virtual so we can override the default
	{
		Debug.Assert(buffer.Length >= limit);
		Debug.Assert(offset >= 0 && offset <= buffer.Length);

		// limit is an exclusive end index, while the string constructor and CopyTo expect a
		// char count, so the count has to be derived from the range.
		int count = limit - offset;
		if (count <= 0)
		{
			return;
		}

		// Optimization provided by Vincent Van Den Berghe:
		// http://search-lucene.com/m/Lucene.Net/j1zMf1uckOzOYqsi?subj=Proposal+to+speed+up+implementation+of+LowercaseFilter+charUtils+ToLower
		// One bulk conversion replaces a per-code-point loop over the range.
		// NOTE(review): relies on invariant casing returning a string of the same length - confirm.
		new string(buffer, offset, count)
			.ToUpperInvariant()
			.CopyTo(0, buffer, offset, count);
	}

	/// <summary>
	/// Decodes <paramref name="srcLen"/> chars of <paramref name="src"/>, starting at
	/// <paramref name="srcOff"/>, into code points stored in <paramref name="dest"/> from
	/// index <paramref name="destOff"/> onwards. Decoding uses this instance's
	/// <see cref="CodePointAt(char[], int, int)"/>. </summary>
	/// <returns> the number of code points written to the destination buffer </returns>
	/// <exception cref="ArgumentException"> if <paramref name="srcLen"/> is negative. </exception>
	public int ToCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff)
	{
		if (srcLen < 0)
		{
			throw new System.ArgumentException("srcLen must be >= 0");
		}

		int srcLimit = srcOff + srcLen;
		int produced = 0;
		int consumed = 0;
		while (consumed < srcLen)
		{
			int codePoint = CodePointAt(src, srcOff + consumed, srcLimit);
			// a code point may occupy more than one char in the source
			int width = Character.CharCount(codePoint);
			dest[destOff + produced] = codePoint;
			produced++;
			consumed += width;
		}
		return produced;
	}

	/// <summary>
	/// Encodes <paramref name="srcLen"/> code points of <paramref name="src"/>, starting at
	/// <paramref name="srcOff"/>, as .NET chars written to <paramref name="dest"/> from index
	/// <paramref name="destOff"/> onwards. </summary>
	/// <returns> the number of chars written to the destination buffer </returns>
	/// <exception cref="ArgumentException"> if <paramref name="srcLen"/> is negative. </exception>
	public int ToChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff)
	{
		if (srcLen < 0)
		{
			throw new System.ArgumentException("srcLen must be >= 0");
		}

		int total = 0;
		int readIndex = srcOff;
		for (int remaining = srcLen; remaining > 0; remaining--)
		{
			int codePoint = src[readIndex];
			readIndex++;
			// Character.ToChars reports how many chars it wrote for this code point
			total += Character.ToChars(codePoint, dest, destOff + total);
		}
		return total;
	}

	/// <summary>
	/// Fills the <see cref="CharacterBuffer"/> with characters read from the given
	/// <see cref="TextReader"/>. This method tries to read <paramref name="numChars"/>
	/// characters into the <see cref="CharacterBuffer"/>; each call to fill will start
	/// filling the buffer from offset <c>0</c> up to <paramref name="numChars"/>.
	/// In case code points can span across 2 chars, this method may
	/// only fill <c>numChars - 1</c> characters in order not to split in
	/// the middle of a surrogate pair, even if there are remaining characters in
	/// the <see cref="TextReader"/>.
	/// <para>
	/// Depending on the <see cref="LuceneVersion"/> passed to
	/// <see cref="CharacterUtils.GetInstance(LuceneVersion)"/> this method implements
	/// supplementary character awareness when filling the given buffer. For all
	/// <see cref="LuceneVersion"/> &gt; 3.0 <see cref="Fill(CharacterBuffer, TextReader, int)"/> guarantees
	/// that the given <see cref="CharacterBuffer"/> will never contain a high surrogate
	/// character as the last element in the buffer unless it is the last available
	/// character in the reader. In other words, high and low surrogate pairs will
	/// always be preserved across buffer borders.
	/// </para>
	/// <para>
	/// A return value of <c>false</c> means that this method call exhausted
	/// the reader, but there may be some chars which have been read, which can be
	/// verified by checking whether <c>buffer.Length &gt; 0</c>.
	/// </para>
	/// </summary>
	/// <param name="buffer">
	/// the buffer to fill. </param>
	/// <param name="reader">
	/// the reader to read characters from. </param>
	/// <param name="numChars">
	/// the number of chars to read </param>
	/// <returns> <c>false</c> if and only if the reader ran out of data while trying to fill the buffer </returns>
	/// <exception cref="IOException">
	/// if the reader throws an <see cref="IOException"/>. </exception>
	public abstract bool Fill(CharacterBuffer buffer, TextReader reader, int numChars);

	/// <summary>
	/// Convenience overload that uses the full size of the backing array as the number of
	/// chars to read; equivalent to <c>Fill(buffer, reader, buffer.Buffer.Length)</c>.
	/// </summary>
	public virtual bool Fill(CharacterBuffer buffer, TextReader reader)
	{
	    int capacity = buffer.Buffer.Length;
	    return Fill(buffer, reader, capacity);
	}

	/// <summary>
	/// Returns the index within <c>buf[start:start+count]</c> that is offset from
	/// <paramref name="index"/> by <paramref name="offset"/> code points.
	/// </summary>
	/// <param name="buf"> the char array to navigate </param>
	/// <param name="start"> first position of the sub-range of <paramref name="buf"/> </param>
	/// <param name="count"> number of chars in the sub-range </param>
	/// <param name="index"> the position to start counting from </param>
	/// <param name="offset"> the number of code points to move by </param>
	/// <returns> the resulting index </returns>
	// NOTE(review): whether index is absolute or relative to start is not visible from this
	// declaration - confirm against the implementations before relying on either.
	public abstract int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset);

	// Keeps reading from the reader into dest (starting at offset) until len chars have
	// been stored or a read call returns no chars. Returns the total number of chars stored.
	private static int ReadFully(TextReader reader, char[] dest, int offset, int len)
	{
	    int total = 0;
	    int chunk;
	    // A non-positive result from Read ends the loop; no further reads are attempted.
	    while (total < len && (chunk = reader.Read(dest, offset + total, len - total)) > 0)
	    {
	        total += chunk;
	    }
	    return total;
	}

	// Implementation whose code point operations are delegated to Character and whose Fill
	// never leaves a high surrogate as the last char of a completely filled buffer.
	private sealed class Java5CharacterUtils : CharacterUtils
	{
	    public override int CodePointAt(string seq, int offset)
	    {
	        return Character.CodePointAt(seq, offset);
	    }

	    public override int CodePointAt(ICharSequence seq, int offset)
	    {
	        return Character.CodePointAt(seq, offset);
	    }

	    public override int CodePointAt(char[] chars, int offset, int limit)
	    {
	        return Character.CodePointAt(chars, offset, limit); // LUCENENET TODO: This will throw a NullReferenceException if chars is null. Should this be an ArgumentNullException in .NET?
	    }

	    // Reads up to numChars chars into the buffer, starting at index 0. A high surrogate
	    // that would end up as the last char of a full buffer is held back in
	    // buffer.lastTrailingHighSurrogate and re-inserted at index 0 by the next call.
	    // Returns true only when exactly numChars chars were made available before trimming.
	    // Throws ArgumentException when numChars is < 2 or exceeds the backing array size.
	    public override bool Fill(CharacterBuffer buffer, TextReader reader, int numChars)
	    {
	        Debug.Assert(buffer.Buffer.Length >= 2);
	        if (numChars < 2 || numChars > buffer.Buffer.Length)
	        {
	            throw new System.ArgumentException("numChars must be >= 2 and <= the buffer size");
	        }
	        char[] charBuffer = buffer.Buffer;
	        buffer.offset = 0;
	        int offset;

	        // Install the previously saved ending high surrogate:
	        if (buffer.lastTrailingHighSurrogate != 0)
	        {
	            charBuffer[0] = buffer.lastTrailingHighSurrogate;
	            buffer.lastTrailingHighSurrogate = (char)0;
	            offset = 1;
	        }
	        else
	        {
	            offset = 0;
	        }

	        // Read the remaining chars behind the (optional) carried-over surrogate.
	        int read = ReadFully(reader, charBuffer, offset, numChars - offset);

	        buffer.length = offset + read;
	        bool result = buffer.length == numChars;
	        if (buffer.length < numChars)
	        {
	            // We failed to fill the buffer. Even if the last char is a high
	            // surrogate, there is nothing we can do
	            return result;
	        }

	        // The buffer is full: hold back a trailing high surrogate so that the pair is
	        // not split between this fill and the next one.
	        if (char.IsHighSurrogate(charBuffer[buffer.length - 1]))
	        {
	            buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
	        }
	        return result;
	    }

	    public override int CodePointCount(string seq)
	    {
	        return Character.CodePointCount(seq, 0, seq.Length);
	    }

	    public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
	    {
	        return Character.OffsetByCodePoints(buf, start, count, index, offset);
	    }
	}

	// LUCENENET specific - not sealed so we can make another override to handle BW compatibility
	// with broken unicode support (Lucene 3.0). See the TestCharArraySet.TestSupplementaryCharsBWCompat()
	// and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests.
	//
	// This implementation treats every single char as one code point: no surrogate pairs are
	// combined by any of its members.
	private class Java4CharacterUtils : CharacterUtils
	{
	    public override int CodePointAt(string seq, int offset)
	    {
	        return seq[offset];
	    }

	    public override int CodePointAt(ICharSequence seq, int offset)
	    {
	        return seq[offset];
	    }

	    // Returns the char at offset; throws IndexOutOfRangeException when offset is not
	    // below limit.
	    public override int CodePointAt(char[] chars, int offset, int limit)
	    {
	        if (offset >= limit)
	        {
	            throw new System.IndexOutOfRangeException("offset must be less than limit");
	        }
	        return chars[offset]; // LUCENENET TODO: This will throw a NullReferenceException if chars is null. Should this be an ArgumentNullException in .NET?
	    }

	    // Reads up to numChars chars into the buffer, starting at index 0, without any
	    // surrogate handling. Returns true only when exactly numChars chars were read.
	    // Throws ArgumentException when numChars is < 1 or exceeds the backing array size.
	    public override bool Fill(CharacterBuffer buffer, TextReader reader, int numChars)
	    {
	        Debug.Assert(buffer.Buffer.Length >= 1);
	        if (numChars < 1 || numChars > buffer.Buffer.Length)
	        {
	            throw new System.ArgumentException("numChars must be >= 1 and <= the buffer size");
	        }
	        buffer.offset = 0;
	        int read = ReadFully(reader, buffer.Buffer, 0, numChars);
	        buffer.length = read;
	        // This implementation never holds back a surrogate, so clear any stale value.
	        buffer.lastTrailingHighSurrogate = (char)0;
	        return read == numChars;
	    }

	    public override int CodePointCount(string seq)
	    {
	        return seq.Length;
	    }

	    // One char == one code point here, so the result is simply index + offset. Neither
	    // buf nor start is consulted; the result is only validated against 0 and count.
	    public override int OffsetByCodePoints(char[] buf, int start, int count, int index, int offset)
	    {
	        int result = index + offset;
	        if (result < 0 || result > count)
	        {
	            throw new System.IndexOutOfRangeException();
	        }
	        return result;
	    }
	}

	// LUCENENET specific class to handle BW compatibility
	// with broken unicode support (Lucene 3.0). See the TestCharArraySet.TestSupplementaryCharsBWCompat()
	// and TestCharArraySet.TestSingleHighSurrogateBWComapt() tests. This just provides the old (slower)
	// implementation that represents the original Lucene toUpperCase and toLowerCase methods.
	private class Java4CharacterUtilsBWCompatibility : Java4CharacterUtils
	{
	    // Lower-cases buffer[offset .. limit) in place, one code point (as decoded by
	    // CodePointAt) at a time. "limit" is an exclusive end index.
	    public override void ToLower(char[] buffer, int offset, int limit)
	    {
	        Debug.Assert(buffer.Length >= limit);
	        // offset must be a valid, non-negative start position inside the buffer.
	        Debug.Assert(offset >= 0 && offset <= buffer.Length);

	        for (int i = offset; i < limit;)
	        {
	            // Character.ToChars returns the number of chars written, which advances i.
	            i += Character.ToChars(
	                Character.ToLower(
	                    CodePointAt(buffer, i, limit)), buffer, i);
	        }
	    }

	    // Upper-cases buffer[offset .. limit) in place, one code point (as decoded by
	    // CodePointAt) at a time. "limit" is an exclusive end index.
	    public override void ToUpper(char[] buffer, int offset, int limit)
	    {
	        Debug.Assert(buffer.Length >= limit);
	        // offset must be a valid, non-negative start position inside the buffer.
	        Debug.Assert(offset >= 0 && offset <= buffer.Length);

	        for (int i = offset; i < limit;)
	        {
	            // Character.ToChars returns the number of chars written, which advances i.
	            i += Character.ToChars(
	                Character.ToUpper(
	                    CodePointAt(buffer, i, limit)), buffer, i);
	        }
	    }
	}

	/// <summary>
	/// Simple IO buffer for use with
	/// <see cref="CharacterUtils.Fill(CharacterBuffer, TextReader)"/>: a char array together
	/// with the offset and length of the data currently held in it.
	/// </summary>
	public sealed class CharacterBuffer
	{
	    private readonly char[] buffer;

	    // NOTE: the members below are not private so the outer class can access them
	    // without $access methods:
	    internal char lastTrailingHighSurrogate;
	    internal int offset;
	    internal int length;

	    internal CharacterBuffer(char[] buffer, int offset, int length)
	    {
	        this.length = length;
	        this.offset = offset;
	        this.buffer = buffer;
	    }

	    /// <summary>
	    /// Gets the internal (writable) char array backing this buffer.
	    /// </summary>
	    /// <value> the buffer </value>
	    [WritableArray]
	    [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
	    public char[] Buffer
	    {
	        get { return this.buffer; }
	    }

	    /// <summary>
	    /// Gets the offset of the data within the internal buffer.
	    /// </summary>
	    /// <value> the offset </value>
	    public int Offset
	    {
	        get { return this.offset; }
	    }

	    /// <summary>
	    /// Gets the length of the data in the internal buffer, counted from
	    /// <see cref="Offset"/>.
	    /// </summary>
	    /// <value> the length </value>
	    public int Length
	    {
	        get { return this.length; }
	    }

	    /// <summary>
	    /// Resets this CharacterBuffer: offset, length and the saved trailing high
	    /// surrogate all go back to their default values. The char array itself is not touched.
	    /// </summary>
	    public void Reset()
	    {
	        this.lastTrailingHighSurrogate = '\0';
	        this.length = 0;
	        this.offset = 0;
	    }
	}
	}
	}