// blob: dc8901a3a26fd8744622655bd0162b98b197775d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
package org.apache.directory.api.util;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
/**
 * Various unicode manipulation methods that are more efficient than chaining
 * operations: all is done in the same buffer without creating a bunch of string
 * objects.
 *
 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
 */
public final class Unicode
{
    /** High bit of a byte : it is clear for a single byte (ASCII) encoding */
    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;

    /** Mask and expected value of the lead byte of a two bytes sequence (110x-xxxx) */
    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
    private static final int UTF8_TWO_BYTES = 0x00C0;

    /** Mask and expected value of the lead byte of a three bytes sequence (1110-xxxx) */
    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
    private static final int UTF8_THREE_BYTES = 0x00E0;

    /** Mask and expected value of the lead byte of a four bytes sequence (1111-0xxx) */
    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
    private static final int UTF8_FOUR_BYTES = 0x00F0;

    /** Mask and expected value of the lead byte of a five bytes sequence (1111-10xx) */
    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
    private static final int UTF8_FIVE_BYTES = 0x00F8;

    /** Mask and expected value of the lead byte of a six bytes sequence (1111-110x) */
    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
    private static final int UTF8_SIX_BYTES = 0x00FC;

    /** The six payload bits carried by every byte following the lead byte (10xx-xxxx) */
    private static final int UTF8_CONTINUATION_PAYLOAD = 0x003F;

    /** The value returned by the decoding methods when no char can be decoded */
    private static final char INVALID_CHAR = ( char ) -1;

    /**
     * The maximum number of chars written in one {@link ObjectOutput#writeUTF(String)} call :
     * a chunk is limited to 65535 bytes, and a char takes at most 3 bytes (65535/3).
     */
    private static final int UTF_CHUNK_SIZE = 21845;

    /** %01-%27 %2B-%5B %5D-%7F */
    private static final boolean[] UNICODE_SUBSET =
        {
            // '\0'
            false, true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '(', ')', '*'
            false, false, false, true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            // '\'
            true,  true,  true,  true,  false, true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
            true,  true,  true,  true,  true,  true,  true,  true,
        };


    /**
     * A private constructor : this is an utility class, it is never instantiated.
     */
    private Unicode()
    {
    }


    /**
     * Count the number of bytes needed to return an Unicode char. This can be
     * from 1 to 6. Only the byte at the given position is inspected : the
     * following bytes are neither read nor validated.
     *
     * @param bytes The bytes to read
     * @param pos Position to start counting. It must be a valid start of a
     *            encoded char !
     * @return The number of bytes to create a char, or -1 if the encoding is
     *         wrong, if the array is null or if the position is out of the array.
     */
    public static int countBytesPerChar( byte[] bytes, int pos )
    {
        if ( ( bytes == null ) || ( pos < 0 ) || ( pos >= bytes.length ) )
        {
            // Nothing to read there : report it the same way as a wrong encoding
            return -1;
        }

        int lead = bytes[pos];

        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
        {
            return 1;
        }

        if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
        {
            return 2;
        }

        if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
        {
            return 3;
        }

        if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
        {
            return 4;
        }

        if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
        {
            return 5;
        }

        if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
        {
            return 6;
        }

        // A continuation byte (10xx-xxxx), 0xFE or 0xFF : not the start of a char
        return -1;
    }


    /**
     * Return the Unicode char which is coded in the bytes at position 0.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @return The first char found, or -1 (cast to a char) if no char can be decoded.
     */
    public static char bytesToChar( byte[] bytes )
    {
        return bytesToChar( bytes, 0 );
    }


    /**
     * Return the Unicode char which is coded in the bytes at the given
     * position.
     * <p>
     * The bytes following the lead byte are not checked to be continuation
     * bytes. As a Java char is 16 bits wide, only the 16 lowest bits of the
     * decoded value are returned : sequences of four bytes and more are truncated.
     *
     * @param bytes The byte[] representation of an Unicode string.
     * @param pos The current position to start decoding the char
     * @return The decoded char, or -1 (cast to a char) if no char can be decoded :
     *         null array, position out of the array, wrong lead byte, or sequence
     *         running past the end of the array.
     */
    public static char bytesToChar( byte[] bytes, int pos )
    {
        int nbBytes = countBytesPerChar( bytes, pos );

        // nbBytes is negative when bytes is null, so bytes.length is never reached in this case
        if ( ( nbBytes < 0 ) || ( nbBytes > bytes.length - pos ) )
        {
            return INVALID_CHAR;
        }

        if ( nbBytes == 1 )
        {
            return ( char ) bytes[pos];
        }

        // The lead byte of a N bytes sequence starts with N bits set followed by
        // a 0 : its payload is what remains, 0x1F for two bytes down to 0x01 for six.
        int value = bytes[pos] & ( 0xFF >> ( nbBytes + 1 ) );

        // Each following byte brings 6 more bits : 10zz-zzzz
        for ( int i = 1; i < nbBytes; i++ )
        {
            value = ( value << 6 ) | ( bytes[pos + i] & UTF8_CONTINUATION_PAYLOAD );
        }

        return ( char ) value;
    }


    /**
     * Return the number of bytes that hold an Unicode char.
     *
     * @param car The character to be encoded
     * @return The number of bytes to hold the char : 1, 2 or 3, as a char is
     *         only 2 bytes long.
     */
    public static int countNbBytesPerChar( char car )
    {
        if ( car <= 0x007F )
        {
            return 1;
        }

        if ( car <= 0x07FF )
        {
            return 2;
        }

        return 3;
    }


    /**
     * Count the number of bytes included in the given char[]. Each char is
     * counted on its own, exactly as {@link #charToBytes(char)} encodes it :
     * a surrogate is counted as 3 bytes.
     *
     * @param chars The char array to encode
     * @return The number of bytes in the char array, 0 if the array is null
     */
    public static int countBytes( char[] chars )
    {
        if ( chars == null )
        {
            return 0;
        }

        int nbBytes = 0;

        for ( char car : chars )
        {
            nbBytes += countNbBytesPerChar( car );
        }

        return nbBytes;
    }


    /**
     * Count the number of chars included in the given byte[]. Each encoded
     * sequence, whatever its length, is counted as one char.
     *
     * @param bytes The byte array to decode
     * @return The number of char in the byte array, 0 if the array is null,
     *         or -1 if a byte which can't start a char is met.
     */
    public static int countChars( byte[] bytes )
    {
        if ( bytes == null )
        {
            return 0;
        }

        int nbChars = 0;
        int currentPos = 0;

        while ( currentPos < bytes.length )
        {
            int nbBytes = countBytesPerChar( bytes, currentPos );

            if ( nbBytes < 0 )
            {
                // Wrong encoding : adding -1 to the position would make us
                // walk backward, and loop forever or leave the array.
                return -1;
            }

            currentPos += nbBytes;
            nbChars++;
        }

        return nbChars;
    }


    /**
     * Return the bytes which encode the given Unicode char.
     *
     * @param car The character to be transformed to an array of bytes
     *
     * @return The byte array representing the char : 1, 2 or 3 bytes, as a
     *         char is only 2 bytes long.
     */
    public static byte[] charToBytes( char car )
    {
        if ( car <= 0x007F )
        {
            // Single byte char : 0xxx-xxxx
            return new byte[]
                { ( byte ) car };
        }

        if ( car <= 0x07FF )
        {
            // Two bytes char : 110x-xxyy 10zz-zzzz
            return new byte[]
                {
                    ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ),
                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
                };
        }

        // Three bytes char : 1110-tttt 10xx-xxyy 10zz-zzzz
        return new byte[]
            {
                ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ),
                ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ),
                ( byte ) ( 0x0080 + ( car & 0x3F ) )
            };
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param str The string to check
     * @param pos Position of the current char
     * @return True if the current char is in the unicode subset, false
     *         otherwise or if the string is null or the position out of it
     */
    public static boolean isUnicodeSubset( String str, int pos )
    {
        if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
        {
            return false;
        }

        return isUnicodeSubset( str.charAt( pos ) );
    }


    /**
     * Check if the current char is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param c The char to check
     * @return True if the current char is in the unicode subset
     */
    public static boolean isUnicodeSubset( char c )
    {
        // The table only covers the ASCII range, every char above is accepted
        return ( c > 127 ) || UNICODE_SUBSET[c];
    }


    /**
     * Check if the current byte is in the unicodeSubset : all chars but
     * '\0', '(', ')', '*' and '\'
     *
     * @param b The byte to check
     * @return True if the current byte is in the unicode subset
     */
    public static boolean isUnicodeSubset( byte b )
    {
        // A byte is signed : a negative value has its high bit set, it is not
        // an ASCII char and is accepted. The other values are in [0, 127].
        return ( b < 0 ) || UNICODE_SUBSET[b];
    }


    /**
     *
     * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
     * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
     * instead of throwing an NullPointerException. Each character in the string s is converted to a group of one,
     * two, or three bytes, depending on the value of the character.
     *
     * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is
     * written in the length information (four bytes (writeInt)) and the string is split into smaller parts
     * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes
     * can be written at maximum we're on the safe side when writing a chunk of only 21845 (65535/3) characters at
     * once.
     *
     * At least one chunk is always written, even for an empty string, and no chunk is written once all the
     * characters have been sent : this is what {@link #readUTF(ObjectInput)} consumes.
     *
     * See also {@link java.io.DataOutput#writeUTF(String)}.
     *
     * @param objectOutput The objectOutput to write to
     * @param str The value to write
     * @throws java.io.IOException If the value can't be written to the file
     */
    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
    {
        // Write a 'null' string
        if ( str == null )
        {
            objectOutput.writeInt( 0 );
            objectOutput.writeUTF( "null" );

            return;
        }

        // Write length of string
        int length = str.length();
        objectOutput.writeInt( length );

        // Write the string in portions not larger than 21845 characters. The
        // loop stops as soon as the last char is written : an extra empty
        // chunk would never be read back and would corrupt the following data.
        int start = 0;

        do
        {
            int end = Math.min( length, start + UTF_CHUNK_SIZE );
            objectOutput.writeUTF( str.substring( start, end ) );
            start = end;
        }
        while ( start < length );
    }


    /**
     *
     * Reads in a string that has been written by {@link #writeUTF(ObjectOutput, String)}.
     *
     * First, four bytes are read (readInt) : this is the number of characters of the string. Then a first chunk
     * encoded in modified UTF-8 format is read. A length of 0 followed by the 'null' chunk stands for a null
     * string. Otherwise, chunks are read and appended until the expected number of characters is reached.
     *
     * See also {@link java.io.DataInput#readUTF()}.
     *
     * @param objectInput The objectInput to read from
     * @return The read string, which may be null
     * @throws java.io.IOException If the value can't be read
     */
    public static String readUTF( ObjectInput objectInput ) throws IOException
    {
        // Read length of the string
        int strLength = objectInput.readInt();

        // Start reading the string : there is always at least one chunk
        String firstChunk = objectInput.readUTF();

        if ( ( strLength == 0 ) && "null".equals( firstChunk ) )
        {
            // The special case of a 'null' string
            return null;
        }

        StringBuilder strBuf = new StringBuilder( firstChunk );

        while ( strLength > strBuf.length() )
        {
            strBuf.append( objectInput.readUTF() );
        }

        return strBuf.toString();
    }
}