blob: 05110e33826dfed01b98b9372d88bc989ea4ae96 [file] [log] [blame]
/*-
* Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle Berkeley
* DB Java Edition made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle Berkeley DB Java Edition for a copy of the
* license and additional information.
*/
package com.sleepycat.util;
/**
* UTF operations with more flexibility than is provided by DataInput and
* DataOutput.
*
* @author Mark Hayes
*/
public class UtfOps {

    /** Shared result for empty strings; a zero-length array cannot be modified. */
    private static final byte[] EMPTY_BYTES = {};

    /** Shared result for zero-length byte ranges. */
    private static final String EMPTY_STRING = "";

    /**
     * Returns the byte length of a null terminated UTF string, not including
     * the terminator.
     *
     * @param bytes the data containing the UTF string.
     *
     * @param offset the beginning of the string to measure.
     *
     * @throws IndexOutOfBoundsException if no zero terminator is found before
     * the end of the array.
     *
     * @return the number of bytes.
     */
    public static int getZeroTerminatedByteLength(byte[] bytes, int offset)
        throws IndexOutOfBoundsException {

        int len = 0;
        /* Running off the end of the array raises the documented exception. */
        while (bytes[offset++] != 0) {
            len++;
        }
        return len;
    }

    /**
     * Returns the byte length of the UTF string that would be created by
     * converting the given characters to UTF.
     *
     * @param chars the characters that would be converted.
     *
     * @return the byte length of the equivalent UTF data.
     */
    public static int getByteLength(char[] chars) {

        return getByteLength(chars, 0, chars.length);
    }

    /**
     * Returns the byte length of the UTF string that would be created by
     * converting the given characters to UTF.
     *
     * <p>Characters in the range 0x0001 to 0x007F take one byte, characters
     * above 0x07FF take three bytes, and all others (including the null
     * character 0x0000) take two bytes.</p>
     *
     * @param chars the characters that would be converted.
     *
     * @param offset the first character to be converted.
     *
     * @param length the number of characters to be converted.
     *
     * @return the byte length of the equivalent UTF data.
     */
    public static int getByteLength(char[] chars, int offset, int length) {

        int len = 0;
        final int end = offset + length;
        for (int i = offset; i < end; i++) {
            final int c = chars[i];
            if ((c >= 0x0001) && (c <= 0x007F)) {
                len++;
            } else if (c > 0x07FF) {
                len += 3;
            } else {
                /* 0x0000 and 0x0080..0x07FF. */
                len += 2;
            }
        }
        return len;
    }

    /**
     * Returns the number of characters represented by the given UTF string.
     *
     * @param bytes the UTF string.
     *
     * @return the number of characters.
     *
     * @throws IndexOutOfBoundsException if a UTF character sequence at the end
     * of the data is not complete.
     *
     * @throws IllegalArgumentException if an illegal UTF sequence is
     * encountered.
     */
    public static int getCharLength(byte[] bytes)
        throws IllegalArgumentException, IndexOutOfBoundsException {

        return getCharLength(bytes, 0, bytes.length);
    }

    /**
     * Returns the number of characters represented by the given UTF string.
     *
     * <p>Only the lead byte of each sequence is examined; continuation bytes
     * are skipped without being validated.</p>
     *
     * @param bytes the data containing the UTF string.
     *
     * @param offset the first byte to be converted.
     *
     * @param length the number of bytes to be converted.
     *
     * @return the number of characters.
     *
     * @throws IndexOutOfBoundsException if a UTF character sequence at the end
     * of the data is not complete, i.e. a multi-byte sequence starts inside
     * the given range but extends past its end.
     *
     * @throws IllegalArgumentException if an illegal UTF sequence is
     * encountered.
     */
    public static int getCharLength(byte[] bytes, int offset, int length)
        throws IllegalArgumentException, IndexOutOfBoundsException {

        int charCount = 0;
        final int end = offset + length;
        while (offset < end) {
            final int lead = bytes[offset] & 0xff;
            final int start = offset;
            /* The high nibble of the lead byte determines the sequence size. */
            switch (lead >> 4) {
            case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                offset++;
                break;
            case 12: case 13:
                offset += 2;
                break;
            case 14:
                offset += 3;
                break;
            default:
                throw illegalByte(lead, start);
            }
            /*
             * Without this check a truncated trailing sequence would be
             * counted silently and later decoded using bytes that lie outside
             * the requested range.
             */
            if (offset > end) {
                throw new IndexOutOfBoundsException
                    ("Incomplete UTF sequence starting at offset " + start);
            }
            charCount++;
        }
        return charCount;
    }

    /**
     * Converts byte arrays into character arrays.
     *
     * <p>The loop bound is tested only before each lead byte is read, so when
     * {@code isByteLen} is true a multi-byte sequence that begins before the
     * boundary is decoded in full even if its continuation bytes lie beyond
     * it; the returned offset then exceeds {@code byteOffset + len}.</p>
     *
     * @param bytes the source byte data to convert
     *
     * @param byteOffset the offset into the byte array at which
     * to start the conversion
     *
     * @param chars the destination array
     *
     * @param charOffset the offset into chars at which to begin the copy
     *
     * @param len the amount of information to copy into chars
     *
     * @param isByteLen if true then len is a measure of bytes, otherwise
     * len is a measure of characters
     *
     * @return the byte offset after converting the bytes.
     *
     * @throws IndexOutOfBoundsException if a UTF character sequence runs past
     * the end of the byte array, or the destination array is too small.
     *
     * @throws IllegalArgumentException if an illegal UTF sequence is
     * encountered.
     */
    public static int bytesToChars(byte[] bytes, int byteOffset,
                                   char[] chars, int charOffset,
                                   int len, boolean isByteLen)
        throws IllegalArgumentException, IndexOutOfBoundsException {

        int char1, char2, char3;
        /* Turn len into an end position in whichever unit it measures. */
        final int end = len + (isByteLen ? byteOffset : charOffset);
        while ((isByteLen ? byteOffset : charOffset) < end) {
            char1 = bytes[byteOffset++] & 0xff;
            switch (char1 >> 4) {
            case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                /* Single byte: 0xxxxxxx. */
                chars[charOffset++] = (char) char1;
                break;
            case 12: case 13:
                /* Two bytes: 110xxxxx 10xxxxxx. */
                char2 = bytes[byteOffset++];
                if ((char2 & 0xC0) != 0x80) {
                    throw illegalByte(char2 & 0xff, byteOffset - 1);
                }
                chars[charOffset++] = (char) (((char1 & 0x1F) << 6) |
                                              (char2 & 0x3F));
                break;
            case 14:
                /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
                char2 = bytes[byteOffset++];
                char3 = bytes[byteOffset++];
                if ((char2 & 0xC0) != 0x80) {
                    throw illegalByte(char2 & 0xff, byteOffset - 2);
                }
                if ((char3 & 0xC0) != 0x80) {
                    throw illegalByte(char3 & 0xff, byteOffset - 1);
                }
                chars[charOffset++] = (char) (((char1 & 0x0F) << 12) |
                                              ((char2 & 0x3F) << 6) |
                                              (char3 & 0x3F));
                break;
            default:
                throw illegalByte(char1, byteOffset - 1);
            }
        }
        return byteOffset;
    }

    /**
     * Converts character arrays into byte arrays.
     *
     * <p>The destination must have room for the number of bytes reported by
     * {@link #getByteLength(char[], int, int)} for the same characters.</p>
     *
     * @param chars the source character data to convert
     *
     * @param charOffset the offset into the character array at which
     * to start the conversion
     *
     * @param bytes the destination array
     *
     * @param byteOffset the offset into bytes at which to begin the copy
     *
     * @param charLength the length of characters to copy into bytes
     */
    public static void charsToBytes(char[] chars, int charOffset,
                                    byte[] bytes, int byteOffset,
                                    int charLength) {

        final int end = charOffset + charLength;
        for (int i = charOffset; i < end; i++) {
            final int c = chars[i];
            if ((c >= 0x0001) && (c <= 0x007F)) {
                bytes[byteOffset++] = (byte) c;
            } else if (c > 0x07FF) {
                bytes[byteOffset++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
                bytes[byteOffset++] = (byte) (0x80 | ((c >> 6) & 0x3F));
                bytes[byteOffset++] = (byte) (0x80 | (c & 0x3F));
            } else {
                /*
                 * Includes 0x0000, which is written as two bytes so that the
                 * output never contains a zero byte.
                 */
                bytes[byteOffset++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
                bytes[byteOffset++] = (byte) (0x80 | (c & 0x3F));
            }
        }
    }

    /**
     * Converts byte arrays into strings.
     *
     * @param bytes the source byte data to convert
     *
     * @param offset the offset into the byte array at which
     * to start the conversion
     *
     * @param length the number of bytes to be converted.
     *
     * @return the string.
     *
     * @throws IndexOutOfBoundsException if a UTF character sequence at the end
     * of the data is not complete.
     *
     * @throws IllegalArgumentException if an illegal UTF sequence is
     * encountered.
     */
    public static String bytesToString(byte[] bytes, int offset, int length)
        throws IllegalArgumentException, IndexOutOfBoundsException {

        if (length == 0) {
            return EMPTY_STRING;
        }
        /* First pass sizes the buffer exactly; second pass decodes into it. */
        final int charLen = getCharLength(bytes, offset, length);
        final char[] chars = new char[charLen];
        bytesToChars(bytes, offset, chars, 0, length, true);
        return new String(chars, 0, charLen);
    }

    /**
     * Converts strings to byte arrays.
     *
     * @param string the string to convert.
     *
     * @return the UTF byte array.
     */
    public static byte[] stringToBytes(String string) {

        if (string.length() == 0) {
            return EMPTY_BYTES;
        }
        final char[] chars = string.toCharArray();
        final byte[] bytes = new byte[getByteLength(chars)];
        charsToBytes(chars, 0, bytes, 0, chars.length);
        return bytes;
    }

    /**
     * Builds the exception reported for a byte that cannot appear at its
     * position in a UTF sequence.
     */
    private static IllegalArgumentException illegalByte(int value,
                                                        int offset) {

        return new IllegalArgumentException
            ("Illegal UTF byte 0x" + Integer.toHexString(value) +
             " at offset " + offset);
    }
}