| package org.apache.lucene.analysis.util; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| |
| import org.apache.lucene.util.Version; |
| |
| /** |
| * {@link CharacterUtils} provides a unified interface to Character-related |
| * operations to implement backwards compatible character operations based on a |
| * {@link Version} instance. |
| * |
| * @lucene.internal |
| */ |
| public abstract class CharacterUtils { |
| private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils(); |
| private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils(); |
| |
| /** |
| * Returns a {@link CharacterUtils} implementation according to the given |
| * {@link Version} instance. |
| * |
| * @param matchVersion |
| * a version instance |
| * @return a {@link CharacterUtils} implementation according to the given |
| * {@link Version} instance. |
| */ |
| public static CharacterUtils getInstance(final Version matchVersion) { |
| return JAVA_5; |
| } |
| |
| /** explicitly returns a version matching java 4 semantics */ |
| public static CharacterUtils getJava4Instance() { |
| return JAVA_4; |
| } |
| |
| /** |
| * Returns the code point at the given index of the char array. |
| * Depending on the {@link Version} passed to |
| * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior |
| * of {@link Character#codePointAt(char[], int)} as it would have been |
| * available on a Java 1.4 JVM or on a later virtual machine version. |
| * |
| * @param chars |
| * a character array |
| * @param offset |
| * the offset to the char values in the chars array to be converted |
| * |
| * @return the Unicode code point at the given index |
| * @throws NullPointerException |
| * - if the array is null. |
| * @throws IndexOutOfBoundsException |
| * - if the value offset is negative or not less than the length of |
| * the char array. |
| */ |
| public abstract int codePointAt(final char[] chars, final int offset); |
| |
| /** |
| * Returns the code point at the given index of the {@link CharSequence}. |
| * Depending on the {@link Version} passed to |
| * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior |
| * of {@link Character#codePointAt(char[], int)} as it would have been |
| * available on a Java 1.4 JVM or on a later virtual machine version. |
| * |
| * @param seq |
| * a character sequence |
| * @param offset |
| * the offset to the char values in the chars array to be converted |
| * |
| * @return the Unicode code point at the given index |
| * @throws NullPointerException |
| * - if the sequence is null. |
| * @throws IndexOutOfBoundsException |
| * - if the value offset is negative or not less than the length of |
| * the character sequence. |
| */ |
| public abstract int codePointAt(final CharSequence seq, final int offset); |
| |
| /** |
| * Returns the code point at the given index of the char array where only elements |
| * with index less than the limit are used. |
| * Depending on the {@link Version} passed to |
| * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior |
| * of {@link Character#codePointAt(char[], int)} as it would have been |
| * available on a Java 1.4 JVM or on a later virtual machine version. |
| * |
| * @param chars |
| * a character array |
| * @param offset |
| * the offset to the char values in the chars array to be converted |
| * @param limit the index afer the last element that should be used to calculate |
| * codepoint. |
| * |
| * @return the Unicode code point at the given index |
| * @throws NullPointerException |
| * - if the array is null. |
| * @throws IndexOutOfBoundsException |
| * - if the value offset is negative or not less than the length of |
| * the char array. |
| */ |
| public abstract int codePointAt(final char[] chars, final int offset, final int limit); |
| |
| /** |
| * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code> |
| * of the given bufferSize. |
| * |
| * @param bufferSize |
| * the internal char buffer size, must be <code>>= 2</code> |
| * @return a new {@link CharacterBuffer} instance. |
| */ |
| public static CharacterBuffer newCharacterBuffer(final int bufferSize) { |
| if (bufferSize < 2) { |
| throw new IllegalArgumentException("buffersize must be >= 2"); |
| } |
| return new CharacterBuffer(new char[bufferSize], 0, 0); |
| } |
| |
| |
| /** |
| * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting |
| * at the given offset. |
| * @param buffer the char buffer to lowercase |
| * @param offset the offset to start at |
| * @param limit the max char in the buffer to lower case |
| */ |
| public void toLowerCase(final char[] buffer, final int offset, final int limit) { |
| assert buffer.length >= limit; |
| assert offset <=0 && offset <= buffer.length; |
| for (int i = offset; i < limit;) { |
| i += Character.toChars( |
| Character.toLowerCase( |
| codePointAt(buffer, i)), buffer, i); |
| } |
| } |
| |
| /** |
| * Fills the {@link CharacterBuffer} with characters read from the given |
| * reader {@link Reader}. This method tries to read as many characters into |
| * the {@link CharacterBuffer} as possible, each call to fill will start |
| * filling the buffer from offset <code>0</code> up to the length of the size |
| * of the internal character array. |
| * <p> |
| * Depending on the {@link Version} passed to |
| * {@link CharacterUtils#getInstance(Version)} this method implements |
| * supplementary character awareness when filling the given buffer. For all |
| * {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees |
| * that the given {@link CharacterBuffer} will never contain a high surrogate |
| * character as the last element in the buffer unless it is the last available |
| * character in the reader. In other words, high and low surrogate pairs will |
| * always be preserved across buffer boarders. |
| * </p> |
| * |
| * @param buffer |
| * the buffer to fill. |
| * @param reader |
| * the reader to read characters from. |
| * @return <code>true</code> if and only if no more characters are available |
| * in the reader, otherwise <code>false</code>. |
| * @throws IOException |
| * if the reader throws an {@link IOException}. |
| */ |
| public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException; |
| |
| private static final class Java5CharacterUtils extends CharacterUtils { |
| Java5CharacterUtils() { |
| } |
| |
| @Override |
| public int codePointAt(final char[] chars, final int offset) { |
| return Character.codePointAt(chars, offset); |
| } |
| |
| @Override |
| public int codePointAt(final CharSequence seq, final int offset) { |
| return Character.codePointAt(seq, offset); |
| } |
| |
| @Override |
| public int codePointAt(final char[] chars, final int offset, final int limit) { |
| return Character.codePointAt(chars, offset, limit); |
| } |
| |
| @Override |
| public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException { |
| final char[] charBuffer = buffer.buffer; |
| buffer.offset = 0; |
| final int offset; |
| |
| // Install the previously saved ending high surrogate: |
| if (buffer.lastTrailingHighSurrogate != 0) { |
| charBuffer[0] = buffer.lastTrailingHighSurrogate; |
| offset = 1; |
| } else { |
| offset = 0; |
| } |
| |
| final int read = reader.read(charBuffer, |
| offset, |
| charBuffer.length - offset); |
| if (read == -1) { |
| buffer.length = offset; |
| buffer.lastTrailingHighSurrogate = 0; |
| return offset != 0; |
| } |
| assert read > 0; |
| buffer.length = read + offset; |
| |
| // If we read only a single char, and that char was a |
| // high surrogate, read again: |
| if (buffer.length == 1 |
| && Character.isHighSurrogate(charBuffer[buffer.length - 1])) { |
| final int read2 = reader.read(charBuffer, |
| 1, |
| charBuffer.length - 1); |
| if (read2 == -1) { |
| // NOTE: mal-formed input (ended on a high |
| // surrogate)! Consumer must deal with it... |
| return true; |
| } |
| assert read2 > 0; |
| |
| buffer.length += read2; |
| } |
| |
| if (buffer.length > 1 |
| && Character.isHighSurrogate(charBuffer[buffer.length - 1])) { |
| buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length]; |
| } else { |
| buffer.lastTrailingHighSurrogate = 0; |
| } |
| |
| return true; |
| } |
| } |
| |
| private static final class Java4CharacterUtils extends CharacterUtils { |
| Java4CharacterUtils() { |
| } |
| |
| @Override |
| public int codePointAt(final char[] chars, final int offset) { |
| return chars[offset]; |
| } |
| |
| @Override |
| public int codePointAt(final CharSequence seq, final int offset) { |
| return seq.charAt(offset); |
| } |
| |
| @Override |
| public int codePointAt(final char[] chars, final int offset, final int limit) { |
| if(offset >= limit) |
| throw new IndexOutOfBoundsException("offset must be less than limit"); |
| return chars[offset]; |
| } |
| |
| @Override |
| public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException { |
| buffer.offset = 0; |
| final int read = reader.read(buffer.buffer); |
| if(read == -1) |
| return false; |
| buffer.length = read; |
| return true; |
| } |
| |
| } |
| |
| /** |
| * A simple IO buffer to use with |
| * {@link CharacterUtils#fill(CharacterBuffer, Reader)}. |
| */ |
| public static final class CharacterBuffer { |
| |
| private final char[] buffer; |
| private int offset; |
| private int length; |
| // NOTE: not private so outer class can access without |
| // $access methods: |
| char lastTrailingHighSurrogate; |
| |
| CharacterBuffer(char[] buffer, int offset, int length) { |
| this.buffer = buffer; |
| this.offset = offset; |
| this.length = length; |
| } |
| |
| /** |
| * Returns the internal buffer |
| * |
| * @return the buffer |
| */ |
| public char[] getBuffer() { |
| return buffer; |
| } |
| |
| /** |
| * Returns the data offset in the internal buffer. |
| * |
| * @return the offset |
| */ |
| public int getOffset() { |
| return offset; |
| } |
| |
| /** |
| * Return the length of the data in the internal buffer starting at |
| * {@link #getOffset()} |
| * |
| * @return the length |
| */ |
| public int getLength() { |
| return length; |
| } |
| |
| /** |
| * Resets the CharacterBuffer. All internals are reset to its default |
| * values. |
| */ |
| public void reset() { |
| offset = 0; |
| length = 0; |
| lastTrailingHighSurrogate = 0; |
| } |
| } |
| |
| } |