| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis; |
| |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| |
/**
 * Utility class to write tokenizers or token filters.
 *
 * <p>All members are static; the class cannot be instantiated.
 *
 * @lucene.internal
 */
public final class CharacterUtils {

  private CharacterUtils() {} // no instantiation

  /**
   * Creates a new {@link CharacterBuffer} and allocates a {@code char[]} of the given
   * bufferSize. The returned buffer starts with offset {@code 0} and length {@code 0}.
   *
   * @param bufferSize the internal char buffer size, must be {@code >= 2}
   * @return a new {@link CharacterBuffer} instance.
   * @throws IllegalArgumentException if {@code bufferSize < 2}
   */
  public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
    if (bufferSize < 2) {
      throw new IllegalArgumentException("buffersize must be >= 2");
    }
    return new CharacterBuffer(new char[bufferSize], 0, 0);
  }

  /**
   * Converts each unicode code point in {@code [offset, limit)} to lower case, in place, via
   * {@link Character#toLowerCase(int)}.
   *
   * @param buffer the char buffer to lowercase
   * @param offset the index of the first char to convert
   * @param limit the index one past the last char to convert (exclusive)
   */
  public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    assert 0 <= offset && offset <= buffer.length;
    for (int i = offset; i < limit; ) {
      // The mapped code point is written back at the index it was read from and the index
      // advances by the number of chars written, so this assumes the mapping does not change
      // how many chars the code point occupies.
      final int lowered = Character.toLowerCase(Character.codePointAt(buffer, i, limit));
      i += Character.toChars(lowered, buffer, i);
    }
  }

  /**
   * Converts each unicode code point in {@code [offset, limit)} to upper case, in place, via
   * {@link Character#toUpperCase(int)}.
   *
   * @param buffer the char buffer to UPPERCASE
   * @param offset the index of the first char to convert
   * @param limit the index one past the last char to convert (exclusive)
   */
  public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    assert 0 <= offset && offset <= buffer.length;
    for (int i = offset; i < limit; ) {
      // Same in-place scheme as toLowerCase: assumes the mapped code point occupies the same
      // number of chars as the original.
      final int raised = Character.toUpperCase(Character.codePointAt(buffer, i, limit));
      i += Character.toChars(raised, buffer, i);
    }
  }

  /**
   * Converts a sequence of Java characters to a sequence of unicode code points.
   *
   * @param src the chars to decode
   * @param srcOff the index in {@code src} of the first char to decode
   * @param srcLen the number of chars to decode, must be {@code >= 0}
   * @param dest the array receiving the code points
   * @param destOff the index in {@code dest} at which the first code point is stored
   * @return the number of code points written to the destination buffer
   * @throws IllegalArgumentException if {@code srcLen < 0}
   */
  public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
    if (srcLen < 0) {
      throw new IllegalArgumentException("srcLen must be >= 0");
    }
    int codePointCount = 0;
    for (int i = 0; i < srcLen; ) {
      // The end bound keeps codePointAt from pairing a trailing high surrogate with a char
      // that lies beyond the requested range.
      final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
      final int charCount = Character.charCount(cp);
      dest[destOff + codePointCount++] = cp;
      i += charCount;
    }
    return codePointCount;
  }

  /**
   * Converts a sequence of unicode code points to a sequence of Java characters.
   *
   * @param src the code points to encode
   * @param srcOff the index in {@code src} of the first code point to encode
   * @param srcLen the number of code points to encode, must be {@code >= 0}
   * @param dest the array receiving the chars
   * @param destOff the index in {@code dest} at which the first char is stored
   * @return the number of chars written to the destination buffer
   * @throws IllegalArgumentException if {@code srcLen < 0}
   */
  public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
    if (srcLen < 0) {
      throw new IllegalArgumentException("srcLen must be >= 0");
    }
    int written = 0;
    for (int i = 0; i < srcLen; ++i) {
      written += Character.toChars(src[srcOff + i], dest, destOff + written);
    }
    return written;
  }

  /**
   * Fills the {@link CharacterBuffer} with characters read from the given {@link Reader}. This
   * method tries to read {@code numChars} characters into the {@link CharacterBuffer}; each
   * call to fill will start filling the buffer from offset {@code 0} up to {@code numChars}.
   * Because code points can span across 2 java characters, this method may only fill
   * {@code numChars - 1} characters in order not to split in the middle of a surrogate pair,
   * even if there are remaining characters in the {@link Reader}.
   *
   * <p>This method guarantees that the given {@link CharacterBuffer} will never contain a high
   * surrogate character as the last element in the buffer unless it is the last available
   * character in the reader. In other words, high and low surrogate pairs will always be
   * preserved across buffer borders. A high surrogate held back by one call is placed at
   * index {@code 0} of the buffer by the next call.
   *
   * <p>A return value of {@code false} means that this method call exhausted the reader, but
   * there may be some chars which have been read, which can be verified by checking whether
   * {@code buffer.getLength() > 0}.
   *
   * @param buffer the buffer to fill.
   * @param reader the reader to read characters from.
   * @param numChars the number of chars to read, must be {@code >= 2} and no larger than the
   *     buffer size
   * @return {@code false} if and only if reader.read returned -1 while trying to fill the
   *     buffer
   * @throws IllegalArgumentException if {@code numChars} is {@code < 2} or greater than the
   *     buffer size
   * @throws IOException if the reader throws an {@link IOException}.
   */
  public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
      throws IOException {
    assert buffer.buffer.length >= 2;
    if (numChars < 2 || numChars > buffer.buffer.length) {
      throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
    }
    final char[] charBuffer = buffer.buffer;
    buffer.offset = 0;
    final int offset;

    // Install the previously saved ending high surrogate:
    if (buffer.lastTrailingHighSurrogate != 0) {
      charBuffer[0] = buffer.lastTrailingHighSurrogate;
      buffer.lastTrailingHighSurrogate = 0;
      offset = 1;
    } else {
      offset = 0;
    }

    final int read = readFully(reader, charBuffer, offset, numChars - offset);

    buffer.length = offset + read;
    final boolean result = buffer.length == numChars;
    if (buffer.length < numChars) {
      // We failed to fill the buffer. Even if the last char is a high
      // surrogate, there is nothing we can do
      return result;
    }

    // The buffer is full: hold back a trailing high surrogate so the next call can put it
    // in front of its low surrogate.
    if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
      buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
    }
    return result;
  }

  /** Convenience method which calls {@code fill(buffer, reader, buffer.buffer.length)}. */
  public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
    return fill(buffer, reader, buffer.buffer.length);
  }

  /**
   * Reads from {@code reader} into {@code dest}, starting at {@code offset}, until {@code len}
   * chars have been read or the reader reports end of stream by returning {@code -1}.
   *
   * @return the number of chars actually read, between {@code 0} and {@code len}
   */
  static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
    int read = 0;
    while (read < len) {
      final int r = reader.read(dest, offset + read, len - read);
      if (r == -1) {
        break;
      }
      read += r;
    }
    return read;
  }

  /**
   * A simple IO buffer to use with {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
   */
  public static final class CharacterBuffer {

    private final char[] buffer;
    private int offset;
    private int length;
    // High surrogate held back by the last fill, or 0 when there is none.
    // NOTE: not private so outer class can access without
    // $access methods:
    char lastTrailingHighSurrogate;

    CharacterBuffer(char[] buffer, int offset, int length) {
      this.buffer = buffer;
      this.offset = offset;
      this.length = length;
    }

    /**
     * Returns the internal buffer. The array itself is returned, not a copy.
     *
     * @return the buffer
     */
    public char[] getBuffer() {
      return buffer;
    }

    /**
     * Returns the data offset in the internal buffer.
     *
     * @return the offset
     */
    public int getOffset() {
      return offset;
    }

    /**
     * Return the length of the data in the internal buffer starting at {@link #getOffset()}
     *
     * @return the length
     */
    public int getLength() {
      return length;
    }

    /**
     * Resets the CharacterBuffer. Offset and length are set to {@code 0} and any held-back
     * high surrogate is discarded; the contents of the char array are left untouched.
     */
    public void reset() {
      offset = 0;
      length = 0;
      lastTrailingHighSurrogate = 0;
    }
  }
}