blob: a2bf886c76bcf3182e7d4bd4bced19203c659db9 [file] [log] [blame]
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Globalization;
using System.Text;
namespace Lucene.Net.Support
{
/// <summary>
/// Mimics Java's Character class.
/// </summary>
public class Character
{
/// <summary>Largest radix constant (36); counterpart of Java's <c>Character.MAX_RADIX</c>.</summary>
public const int MAX_RADIX = 36;
/// <summary>Smallest radix constant (2); counterpart of Java's <c>Character.MIN_RADIX</c>.</summary>
public const int MIN_RADIX = 2;
/// <summary>Largest Unicode code point value (U+10FFFF).</summary>
public const int MAX_CODE_POINT = 0x10FFFF;
/// <summary>Smallest Unicode code point value (U+0000).</summary>
public const int MIN_CODE_POINT = 0x000000;
/// <summary>Highest UTF-16 surrogate code unit (same value as <see cref="MAX_LOW_SURROGATE"/>).</summary>
public const char MAX_SURROGATE = '\uDFFF';
/// <summary>Lowest UTF-16 surrogate code unit (same value as <see cref="MIN_HIGH_SURROGATE"/>).</summary>
public const char MIN_SURROGATE = '\uD800';
/// <summary>Lowest low (trailing) surrogate code unit.</summary>
public const char MIN_LOW_SURROGATE = '\uDC00';
/// <summary>Highest low (trailing) surrogate code unit.</summary>
public const char MAX_LOW_SURROGATE = '\uDFFF';
/// <summary>Lowest high (leading) surrogate code unit.</summary>
public const char MIN_HIGH_SURROGATE = '\uD800';
/// <summary>Highest high (leading) surrogate code unit.</summary>
public const char MAX_HIGH_SURROGATE = '\uDBFF';
/// <summary>
/// First supplementary code point (U+10000). Code points at or above this value are
/// reported by <c>CharCount</c> as needing two <see cref="char"/>s.
/// </summary>
public const int MIN_SUPPLEMENTARY_CODE_POINT = 0x010000;
/// <summary>
/// Forwards to <c>J2N.Character.ToChars(int, char[], int)</c> and returns its result unchanged.
/// </summary>
/// <remarks>
/// NOTE(review): presumably writes the UTF-16 form of <paramref name="codePoint"/> into
/// <paramref name="dst"/> starting at <paramref name="dstIndex"/> and returns the number of
/// chars written, as Java's <c>Character.toChars</c> does; confirm against the J2N documentation.
/// </remarks>
/// <param name="codePoint">The code point handed to J2N.</param>
/// <param name="dst">The destination array handed to J2N.</param>
/// <param name="dstIndex">The destination index handed to J2N.</param>
/// <returns>Whatever <c>J2N.Character.ToChars</c> returns.</returns>
public static int ToChars(int codePoint, char[] dst, int dstIndex)
    => J2N.Character.ToChars(codePoint, dst, dstIndex);
/// <summary>
/// Forwards to <c>J2N.Character.ToChars(int)</c> and returns its result unchanged.
/// </summary>
/// <remarks>
/// NOTE(review): presumably yields the UTF-16 representation of <paramref name="codePoint"/>
/// as a new array, as Java's <c>Character.toChars</c> does; confirm against the J2N documentation.
/// </remarks>
/// <param name="codePoint">The code point handed to J2N.</param>
/// <returns>Whatever <c>J2N.Character.ToChars</c> returns.</returns>
public static char[] ToChars(int codePoint)
    => J2N.Character.ToChars(codePoint);
/// <summary>
/// Combines a high and a low surrogate into the supplementary code point they encode.
/// </summary>
/// <remarks>
/// The arguments are not validated; the arithmetic is applied to whatever chars are passed in.
/// </remarks>
/// <param name="high">The high (leading) surrogate.</param>
/// <param name="low">The low (trailing) surrogate.</param>
/// <returns>The combined code point value.</returns>
public static int ToCodePoint(char high, char low)
{
    // Algebraically the same as
    //   ((high - MIN_HIGH_SURROGATE) << 10) + (low - MIN_LOW_SURROGATE) + MIN_SUPPLEMENTARY_CODE_POINT
    // but with the three constant terms folded into a single compile-time value.
    const int combinedOffset = MIN_SUPPLEMENTARY_CODE_POINT
        - (MIN_HIGH_SURROGATE << 10)
        - MIN_LOW_SURROGATE;
    int shiftedHigh = high << 10;
    return shiftedHigh + low + combinedOffset;
}
/// <summary>
/// Returns the code point that ends immediately before <paramref name="index"/> in
/// <paramref name="seq"/>.
/// </summary>
/// <remarks>
/// If the two chars preceding <paramref name="index"/> form a surrogate pair, the combined
/// supplementary code point is returned; otherwise the single char at <c>index - 1</c> is returned.
/// </remarks>
/// <param name="seq">The char array to read from.</param>
/// <param name="index">The index following the code point to return; must be between 1 and <c>seq.Length</c> inclusive.</param>
/// <returns>The code point value before <paramref name="index"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is <c>null</c>.</exception>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="index"/> is less than 1 or greater than <c>seq.Length</c>.
/// </exception>
public static int CodePointBefore(char[] seq, int index)
{
    if (seq == null)
    {
        throw new ArgumentNullException(nameof(seq));
    }
    if (index < 1 || index > seq.Length)
    {
        // IndexOutOfRangeException(string) takes a message, not a parameter name,
        // so spell out what went wrong instead of passing just "index".
        throw new IndexOutOfRangeException(
            nameof(index) + " (" + index + ") must be at least 1 and at most the array length (" + seq.Length + ").");
    }

    char low = seq[index - 1];
    if (index >= 2)
    {
        // Only look one char further back when there is one; a pair needs both halves.
        char high = seq[index - 2];
        if (char.IsSurrogatePair(high, low))
        {
            return ToCodePoint(high, low);
        }
    }
    return low;
}
/// <summary>
/// Forwards <paramref name="codePoint"/> to <c>J2N.Character.ToLower</c> using
/// <see cref="CultureInfo.InvariantCulture"/> and returns the result.
/// </summary>
/// <param name="codePoint">The code point to convert.</param>
/// <returns>The value returned by <c>J2N.Character.ToLower</c>.</returns>
public static int ToLower(int codePoint)
    => J2N.Character.ToLower(codePoint, CultureInfo.InvariantCulture);
/// <summary>
/// Forwards <paramref name="codePoint"/> to <c>J2N.Character.ToUpper</c> using
/// <see cref="CultureInfo.InvariantCulture"/> and returns the result.
/// </summary>
/// <param name="codePoint">The code point to convert.</param>
/// <returns>The value returned by <c>J2N.Character.ToUpper</c>.</returns>
public static int ToUpper(int codePoint)
    => J2N.Character.ToUpper(codePoint, CultureInfo.InvariantCulture);
/// <summary>
/// Returns how many <see cref="char"/> values are needed to represent
/// <paramref name="codePoint"/> in UTF-16.
/// </summary>
/// <remarks>
/// The argument is not validated: any value below <see cref="MIN_SUPPLEMENTARY_CODE_POINT"/>
/// yields 1 and any other value yields 2.
/// </remarks>
/// <param name="codePoint">The code point to measure.</param>
/// <returns>2 for supplementary code points, otherwise 1.</returns>
public static int CharCount(int codePoint)
{
    // Code points below U+10000 fit in a single UTF-16 char;
    // everything from U+10000 upward is stored as a surrogate pair.
    if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT)
    {
        return 1;
    }
    return 2;
}
/// <summary>
/// Returns the number of Unicode code points in the text range of the specified char sequence.
/// The text range begins at the specified <paramref name="beginIndex"/> and extends to the char at index <c>endIndex - 1</c>.
/// Thus the length (in <see cref="char"/>s) of the text range is <c>endIndex-beginIndex</c>.
/// Unpaired surrogates within the text range count as one code point each.
/// </summary>
/// <param name="seq">the char sequence</param>
/// <param name="beginIndex">the index to the first char of the text range.</param>
/// <param name="endIndex">the index after the last char of the text range.</param>
/// <returns>the number of Unicode code points in the specified text range</returns>
/// <exception cref="ArgumentNullException">if <paramref name="seq"/> is <c>null</c>.</exception>
/// <exception cref="IndexOutOfRangeException">
/// if the <paramref name="beginIndex"/> is negative, or <paramref name="endIndex"/>
/// is larger than the length of the given sequence, or <paramref name="beginIndex"/>
/// is larger than <paramref name="endIndex"/>.
/// </exception>
public static int CodePointCount(string seq, int beginIndex, int endIndex)
{
    // Fail with a descriptive argument exception rather than an incidental
    // NullReferenceException from seq.Length.
    if (seq == null)
    {
        throw new ArgumentNullException(nameof(seq));
    }
    if (beginIndex < 0 || endIndex > seq.Length || beginIndex > endIndex)
    {
        throw new IndexOutOfRangeException();
    }

    // Start from the char count and subtract one for every complete surrogate pair.
    int n = endIndex - beginIndex;
    for (int i = beginIndex; i < endIndex;)
    {
        if (char.IsHighSurrogate(seq[i++]) && i < endIndex &&
            char.IsLowSurrogate(seq[i]))
        {
            n--;
            i++; // skip the low surrogate that was just consumed
        }
    }
    return n;
}
/// <summary>
/// Returns the number of Unicode code points in the <paramref name="count"/> chars of
/// <paramref name="a"/> starting at <paramref name="offset"/>. Unpaired surrogates count as
/// one code point each.
/// </summary>
/// <param name="a">The char array.</param>
/// <param name="offset">Index of the first char of the range.</param>
/// <param name="count">Length of the range in chars.</param>
/// <returns>The number of code points in the range.</returns>
/// <exception cref="ArgumentNullException"><paramref name="a"/> is <c>null</c>.</exception>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="offset"/> or <paramref name="count"/> is negative, or the range extends
/// past the end of <paramref name="a"/>.
/// </exception>
public static int CodePointCount(char[] a, int offset, int count)
{
    // Fail with a descriptive argument exception rather than an incidental
    // NullReferenceException from a.Length.
    if (a == null)
    {
        throw new ArgumentNullException(nameof(a));
    }
    // "count > a.Length - offset" is kept in this form so the check itself cannot overflow
    // for non-negative offsets.
    if (count > a.Length - offset || offset < 0 || count < 0)
    {
        throw new IndexOutOfRangeException();
    }
    return CodePointCountImpl(a, offset, count);
}
/// <summary>
/// Counts the code points in <paramref name="count"/> chars of <paramref name="a"/> starting
/// at <paramref name="offset"/>, without validating the arguments.
/// </summary>
/// <param name="a">The char array.</param>
/// <param name="offset">Index of the first char of the range.</param>
/// <param name="count">Length of the range in chars.</param>
/// <returns>
/// <paramref name="count"/> minus the number of complete surrogate pairs found in the range.
/// </returns>
internal static int CodePointCountImpl(char[] a, int offset, int count)
{
    int end = offset + count;
    int total = count;
    int pos = offset;
    while (pos < end)
    {
        char current = a[pos];
        pos++;
        if (!char.IsHighSurrogate(current))
        {
            continue;
        }
        // A high surrogate only forms a pair when its low surrogate is still inside the range.
        if (pos < end && char.IsLowSurrogate(a[pos]))
        {
            total--;
            pos++;
        }
    }
    return total;
}
/// <summary>
/// Returns the code point that starts at <paramref name="index"/> in <paramref name="seq"/>.
/// </summary>
/// <remarks>
/// If the char at <paramref name="index"/> is a high surrogate and is followed, within the
/// string, by a low surrogate, the combined supplementary code point is returned; otherwise
/// the char at <paramref name="index"/> is returned as is. <paramref name="index"/> is not
/// range-checked here; an invalid value surfaces from the string indexer.
/// </remarks>
/// <param name="seq">The string to read from.</param>
/// <param name="index">Index of the first char of the code point.</param>
/// <returns>The code point value at <paramref name="index"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is <c>null</c>.</exception>
public static int CodePointAt(string seq, int index)
{
    // Fail with a descriptive argument exception rather than an incidental
    // NullReferenceException from the indexer.
    if (seq == null)
    {
        throw new ArgumentNullException(nameof(seq));
    }

    char c1 = seq[index];
    int next = index + 1;
    if (char.IsHighSurrogate(c1) && next < seq.Length)
    {
        char c2 = seq[next];
        if (char.IsLowSurrogate(c2))
        {
            return ToCodePoint(c1, c2);
        }
    }
    return c1;
}
/// <summary>
/// Combines a high and a low surrogate into a code point, using the same arithmetic as
/// <see cref="ToCodePoint(char, char)"/>.
/// </summary>
/// <remarks>
/// The arguments are not validated; the arithmetic is applied to whatever chars are passed in.
/// </remarks>
/// <param name="high">The high (leading) surrogate.</param>
/// <param name="low">The low (trailing) surrogate.</param>
/// <returns>The combined code point value.</returns>
public static int CodePointAt(char high, char low)
{
    // The constant terms of the surrogate-pair formula, folded into one compile-time value.
    const int combinedOffset = MIN_SUPPLEMENTARY_CODE_POINT
        - (MIN_HIGH_SURROGATE << 10)
        - MIN_LOW_SURROGATE;
    int shiftedHigh = high << 10;
    return shiftedHigh + low + combinedOffset;
}
/// <summary>
/// Returns the code point that starts at <paramref name="index"/> in <paramref name="seq"/>.
/// </summary>
/// <remarks>
/// If the char at <paramref name="index"/> is a high surrogate and is followed, within the
/// builder, by a low surrogate, the combined supplementary code point is returned; otherwise
/// the char at <paramref name="index"/> is returned as is. <paramref name="index"/> is not
/// range-checked here; an invalid value surfaces from the <see cref="StringBuilder"/> indexer.
/// </remarks>
/// <param name="seq">The builder to read from.</param>
/// <param name="index">Index of the first char of the code point.</param>
/// <returns>The code point value at <paramref name="index"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is <c>null</c>.</exception>
public static int CodePointAt(StringBuilder seq, int index)
{
    // Fail with a descriptive argument exception rather than an incidental
    // NullReferenceException from the indexer.
    if (seq == null)
    {
        throw new ArgumentNullException(nameof(seq));
    }

    char c1 = seq[index];
    int next = index + 1;
    if (char.IsHighSurrogate(c1) && next < seq.Length)
    {
        char c2 = seq[next];
        if (char.IsLowSurrogate(c2))
        {
            return ToCodePoint(c1, c2);
        }
    }
    return c1;
}
/// <summary>
/// Returns the code point that starts at <paramref name="index"/> in <paramref name="seq"/>.
/// </summary>
/// <remarks>
/// If the char at <paramref name="index"/> is a high surrogate and is followed, within the
/// sequence, by a low surrogate, the combined supplementary code point is returned; otherwise
/// the char at <paramref name="index"/> is returned as is. <paramref name="index"/> is not
/// range-checked here; how an invalid value is reported depends on the sequence's indexer.
/// </remarks>
/// <param name="seq">The char sequence to read from.</param>
/// <param name="index">Index of the first char of the code point.</param>
/// <returns>The code point value at <paramref name="index"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is <c>null</c>.</exception>
public static int CodePointAt(ICharSequence seq, int index)
{
    // Fail with a descriptive argument exception rather than an incidental
    // NullReferenceException from the indexer.
    if (seq == null)
    {
        throw new ArgumentNullException(nameof(seq));
    }

    char c1 = seq[index];
    int next = index + 1;
    if (char.IsHighSurrogate(c1) && next < seq.Length)
    {
        char c2 = seq[next];
        if (char.IsLowSurrogate(c2))
        {
            return ToCodePoint(c1, c2);
        }
    }
    return c1;
}
/// <summary>
/// Returns the code point that starts at <paramref name="index"/> in <paramref name="a"/>,
/// considering only chars at indices below <paramref name="limit"/>.
/// </summary>
/// <param name="a">The char array to read from.</param>
/// <param name="index">Index of the first char of the code point; must be below <paramref name="limit"/>.</param>
/// <param name="limit">Exclusive upper bound of usable indices; must not exceed <c>a.Length</c>.</param>
/// <returns>The code point value at <paramref name="index"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="a"/> is <c>null</c>.</exception>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="index"/> is not below <paramref name="limit"/>, or <paramref name="limit"/>
/// is negative or greater than <c>a.Length</c>. A negative <paramref name="index"/> is not
/// rejected by this check and is reported by the array access itself.
/// </exception>
public static int CodePointAt(char[] a, int index, int limit)
{
    // Fail with a descriptive argument exception rather than an incidental
    // NullReferenceException from a.Length.
    if (a == null)
    {
        throw new ArgumentNullException(nameof(a));
    }
    if (index >= limit || limit < 0 || limit > a.Length)
    {
        throw new IndexOutOfRangeException();
    }
    return CodePointAtImpl(a, index, limit);
}
/// <summary>
/// Reads the code point that starts at <paramref name="index"/> in <paramref name="a"/>
/// without validating the arguments; a low surrogate is only consumed when its index is
/// below <paramref name="limit"/>.
/// </summary>
/// <remarks>
/// An <paramref name="index"/> outside the array bounds makes the array access throw
/// <see cref="IndexOutOfRangeException"/>.
/// </remarks>
static int CodePointAtImpl(char[] a, int index, int limit)
{
    char first = a[index];
    int next = index + 1;
    if (!char.IsHighSurrogate(first) || next >= limit)
    {
        // Not the start of a pair, or no room left for the second half.
        return first;
    }

    char second = a[next];
    return char.IsLowSurrogate(second) ? ToCodePoint(first, second) : first;
}
/// <summary>
/// Returns the index within <paramref name="seq"/> that is offset from <paramref name="index"/>
/// by <paramref name="codePointOffset"/> code points. Unpaired surrogates count as one code
/// point each.
/// </summary>
/// <remarks>
/// Copy of the implementation from Character class in Java
///
/// http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b27/java/lang/Character.java
/// </remarks>
/// <param name="seq">The string to move through.</param>
/// <param name="index">The starting index; must be between 0 and <c>seq.Length</c> inclusive.</param>
/// <param name="codePointOffset">Number of code points to move; negative values move backwards.</param>
/// <returns>The resulting index within <paramref name="seq"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seq"/> is <c>null</c>.</exception>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="index"/> is negative or greater than <c>seq.Length</c>, or there are fewer
/// than the requested number of code points after (or, for a negative offset, before)
/// <paramref name="index"/>.
/// </exception>
public static int OffsetByCodePoints(string seq, int index,
    int codePointOffset)
{
    // Fail with a descriptive argument exception rather than an incidental
    // NullReferenceException from seq.Length.
    if (seq == null)
    {
        throw new ArgumentNullException(nameof(seq));
    }
    int length = seq.Length;
    if (index < 0 || index > length)
    {
        throw new IndexOutOfRangeException();
    }

    int x = index;
    if (codePointOffset >= 0)
    {
        // Walk forward, stepping over a complete surrogate pair as a single code point.
        int i;
        for (i = 0; x < length && i < codePointOffset; i++)
        {
            if (char.IsHighSurrogate(seq[x++]) && x < length && char.IsLowSurrogate(seq[x]))
            {
                x++;
            }
        }
        if (i < codePointOffset)
        {
            // Ran off the end before moving the requested number of code points.
            throw new IndexOutOfRangeException();
        }
    }
    else
    {
        // Walk backward, stepping over a complete surrogate pair as a single code point.
        int i;
        for (i = codePointOffset; x > 0 && i < 0; i++)
        {
            if (char.IsLowSurrogate(seq[--x]) && x > 0 && char.IsHighSurrogate(seq[x - 1]))
            {
                x--;
            }
        }
        if (i < 0)
        {
            // Reached the start before moving the requested number of code points.
            throw new IndexOutOfRangeException();
        }
    }
    return x;
}
/// <summary>
/// Returns the index within the subarray of <paramref name="a"/> (starting at
/// <paramref name="start"/>, <paramref name="count"/> chars long) that is offset from
/// <paramref name="index"/> by <paramref name="codePointOffset"/> code points.
/// </summary>
/// <remarks>
/// Copy of the implementation from Character class in Java
///
/// http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b27/java/lang/Character.java
/// </remarks>
/// <param name="a">The char array.</param>
/// <param name="start">Index of the first char of the subarray.</param>
/// <param name="count">Length of the subarray in chars.</param>
/// <param name="index">The starting index; must lie between <paramref name="start"/> and <c>start + count</c> inclusive.</param>
/// <param name="codePointOffset">Number of code points to move; negative values move backwards.</param>
/// <returns>The resulting index within the subarray.</returns>
/// <exception cref="ArgumentNullException"><paramref name="a"/> is <c>null</c>.</exception>
/// <exception cref="IndexOutOfRangeException">
/// <paramref name="start"/> or <paramref name="count"/> is negative, the subarray extends past
/// the end of <paramref name="a"/>, <paramref name="index"/> lies outside the subarray, or the
/// subarray holds too few code points in the requested direction.
/// </exception>
public static int OffsetByCodePoints(char[] a, int start, int count,
    int index, int codePointOffset)
{
    // Fail with a descriptive argument exception rather than an incidental
    // NullReferenceException from a.Length.
    if (a == null)
    {
        throw new ArgumentNullException(nameof(a));
    }
    // The subarray checks come first so that "start + count" below cannot exceed a.Length.
    if (count > a.Length - start || start < 0 || count < 0
        || index < start || index > start + count)
    {
        throw new IndexOutOfRangeException();
    }
    return OffsetByCodePointsImpl(a, start, count, index, codePointOffset);
}
/// <summary>
/// Moves <paramref name="codePointOffset"/> code points away from <paramref name="index"/>
/// inside the subarray that starts at <paramref name="start"/> and is <paramref name="count"/>
/// chars long, without validating the arguments.
/// </summary>
/// <returns>The resulting index.</returns>
/// <exception cref="IndexOutOfRangeException">
/// The subarray boundary is reached before the requested number of code points has been passed.
/// </exception>
static int OffsetByCodePointsImpl(char[] a, int start, int count,
    int index, int codePointOffset)
{
    int pos = index;

    if (codePointOffset < 0)
    {
        // Backward walk: "remaining" climbs from the negative offset up to zero.
        int remaining = codePointOffset;
        while (pos > start && remaining < 0)
        {
            pos--;
            if (char.IsLowSurrogate(a[pos]) && pos > start
                && char.IsHighSurrogate(a[pos - 1]))
            {
                pos--; // step over the high half of the pair as well
            }
            remaining++;
        }
        if (remaining < 0)
        {
            throw new IndexOutOfRangeException();
        }
        return pos;
    }

    // Forward walk: "moved" counts the code points passed so far.
    int limit = start + count;
    int moved = 0;
    while (pos < limit && moved < codePointOffset)
    {
        char current = a[pos];
        pos++;
        if (char.IsHighSurrogate(current) && pos < limit && char.IsLowSurrogate(a[pos]))
        {
            pos++; // step over the low half of the pair as well
        }
        moved++;
    }
    if (moved < codePointOffset)
    {
        throw new IndexOutOfRangeException();
    }
    return pos;
}
/// <summary>
/// Forwards <paramref name="c"/> to <c>J2N.Character.IsLetter</c> and returns its answer.
/// </summary>
/// <param name="c">The code point to test.</param>
/// <returns>The value returned by <c>J2N.Character.IsLetter</c>.</returns>
public static bool IsLetter(int c)
    => J2N.Character.IsLetter(c);
/// <summary>
/// Returns the <see cref="UnicodeCategory"/> of <paramref name="codePoint"/> by forwarding to
/// <c>J2N.Character.GetType</c>.
/// </summary>
/// <remarks>
/// This mimics Java's <c>Character.getType</c> method, but returns the .NET
/// <see cref="UnicodeCategory"/> enumeration for easy consumption.
/// <para>
/// Background: the .NET <see cref="char.ConvertFromUtf32(int)"/> method throws for values
/// between 0x00d800 and 0x00dfff, so a lookup that goes through it needs special handling
/// for that range.
/// NOTE(review): this method only delegates, so that handling is presumably done inside J2N;
/// confirm against the J2N documentation.
/// </para>
/// </remarks>
/// <param name="codePoint">The code point to classify.</param>
/// <returns> A <see cref="UnicodeCategory"/> representing the <paramref name="codePoint"/>. </returns>
public static UnicodeCategory GetType(int codePoint)
    => J2N.Character.GetType(codePoint);
}
}