blob: 9ed077c96f206a54303a3bbd6d6b21da215fdc36 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* Utility class to write tokenizers or token filters.
* @lucene.internal
*/
public final class CharacterUtils {
private CharacterUtils() {} // no instantiation
/**
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
* of the given bufferSize.
*
* @param bufferSize
* the internal char buffer size, must be <code>&gt;= 2</code>
* @return a new {@link CharacterBuffer} instance.
*/
public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
if (bufferSize < 2) {
throw new IllegalArgumentException("buffersize must be >= 2");
}
return new CharacterBuffer(new char[bufferSize], 0, 0);
}
/**
* Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting
* at the given offset.
* @param buffer the char buffer to lowercase
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert 0 <= offset && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toLowerCase(
Character.codePointAt(buffer, i, limit)), buffer, i);
}
}
/**
* Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting
* at the given offset.
* @param buffer the char buffer to UPPERCASE
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert 0 <= offset && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toUpperCase(
Character.codePointAt(buffer, i, limit)), buffer, i);
}
}
/** Converts a sequence of Java characters to a sequence of unicode code points.
* @return the number of code points written to the destination buffer */
public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
if (srcLen < 0) {
throw new IllegalArgumentException("srcLen must be >= 0");
}
int codePointCount = 0;
for (int i = 0; i < srcLen; ) {
final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
final int charCount = Character.charCount(cp);
dest[destOff + codePointCount++] = cp;
i += charCount;
}
return codePointCount;
}
/** Converts a sequence of unicode code points to a sequence of Java characters.
* @return the number of chars written to the destination buffer */
public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
if (srcLen < 0) {
throw new IllegalArgumentException("srcLen must be >= 0");
}
int written = 0;
for (int i = 0; i < srcLen; ++i) {
written += Character.toChars(src[srcOff + i], dest, destOff + written);
}
return written;
}
/**
* Fills the {@link CharacterBuffer} with characters read from the given
* reader {@link Reader}. This method tries to read <code>numChars</code>
* characters into the {@link CharacterBuffer}, each call to fill will start
* filling the buffer from offset <code>0</code> up to <code>numChars</code>.
* In case code points can span across 2 java characters, this method may
* only fill <code>numChars - 1</code> characters in order not to split in
* the middle of a surrogate pair, even if there are remaining characters in
* the {@link Reader}.
* <p>
* This method guarantees
* that the given {@link CharacterBuffer} will never contain a high surrogate
* character as the last element in the buffer unless it is the last available
* character in the reader. In other words, high and low surrogate pairs will
* always be preserved across buffer boarders.
* </p>
* <p>
* A return value of <code>false</code> means that this method call exhausted
* the reader, but there may be some bytes which have been read, which can be
* verified by checking whether <code>buffer.getLength() &gt; 0</code>.
* </p>
*
* @param buffer
* the buffer to fill.
* @param reader
* the reader to read characters from.
* @param numChars
* the number of chars to read
* @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
* @throws IOException
* if the reader throws an {@link IOException}.
*/
public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
assert buffer.buffer.length >= 2;
if (numChars < 2 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
}
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
final int offset;
// Install the previously saved ending high surrogate:
if (buffer.lastTrailingHighSurrogate != 0) {
charBuffer[0] = buffer.lastTrailingHighSurrogate;
buffer.lastTrailingHighSurrogate = 0;
offset = 1;
} else {
offset = 0;
}
final int read = readFully(reader, charBuffer, offset, numChars - offset);
buffer.length = offset + read;
final boolean result = buffer.length == numChars;
if (buffer.length < numChars) {
// We failed to fill the buffer. Even if the last char is a high
// surrogate, there is nothing we can do
return result;
}
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
}
return result;
}
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
return fill(buffer, reader, buffer.buffer.length);
}
static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}
/**
* A simple IO buffer to use with
* {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
*/
public static final class CharacterBuffer {
private final char[] buffer;
private int offset;
private int length;
// NOTE: not private so outer class can access without
// $access methods:
char lastTrailingHighSurrogate;
CharacterBuffer(char[] buffer, int offset, int length) {
this.buffer = buffer;
this.offset = offset;
this.length = length;
}
/**
* Returns the internal buffer
*
* @return the buffer
*/
public char[] getBuffer() {
return buffer;
}
/**
* Returns the data offset in the internal buffer.
*
* @return the offset
*/
public int getOffset() {
return offset;
}
/**
* Return the length of the data in the internal buffer starting at
* {@link #getOffset()}
*
* @return the length
*/
public int getLength() {
return length;
}
/**
* Resets the CharacterBuffer. All internals are reset to its default
* values.
*/
public void reset() {
offset = 0;
length = 0;
lastTrailingHighSurrogate = 0;
}
}
}