| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| */ |
| package org.apache.directory.shared.util; |
| |
| |
| import java.io.IOException; |
| import java.io.ObjectInput; |
| import java.io.ObjectOutput; |
| |
/**
 * Various Unicode manipulation methods that are more efficient than chaining
 * operations: everything is done in the same buffer without creating a bunch
 * of string objects.
 *
 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
 */
| public final class Unicode |
| { |
| /** |
| * Count the number of bytes needed to return an Unicode char. This can be |
| * from 1 to 6. |
| * |
| * @param bytes The bytes to read |
| * @param pos Position to start counting. It must be a valid start of a |
| * encoded char ! |
| * @return The number of bytes to create a char, or -1 if the encoding is |
| * wrong. TODO : Should stop after the third byte, as a char is only |
| * 2 bytes long. |
| */ |
| public static int countBytesPerChar( byte[] bytes, int pos ) |
| { |
| if ( bytes == null ) |
| { |
| return -1; |
| } |
| |
| if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 ) |
| { |
| return 1; |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES ) |
| { |
| return 2; |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES ) |
| { |
| return 3; |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES ) |
| { |
| return 4; |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) |
| { |
| return 5; |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_SIX_BYTES_MASK ) == UnicodeConstants.UTF8_SIX_BYTES ) |
| { |
| return 6; |
| } |
| else |
| { |
| return -1; |
| } |
| } |
| |
| /** |
| * Return the Unicode char which is coded in the bytes at position 0. |
| * |
| * @param bytes The byte[] represntation of an Unicode string. |
| * @return The first char found. |
| */ |
| public static char bytesToChar( byte[] bytes ) |
| { |
| return bytesToChar( bytes, 0 ); |
| } |
| |
| /** |
| * Return the Unicode char which is coded in the bytes at the given |
| * position. |
| * |
| * @param bytes The byte[] represntation of an Unicode string. |
| * @param pos The current position to start decoding the char |
| * @return The decoded char, or -1 if no char can be decoded TODO : Should |
| * stop after the third byte, as a char is only 2 bytes long. |
| */ |
| public static char bytesToChar( byte[] bytes, int pos ) |
| { |
| if ( bytes == null ) |
| { |
| return ( char ) -1; |
| } |
| |
| if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 ) |
| { |
| return ( char ) bytes[pos]; |
| } |
| else |
| { |
| if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES ) |
| { |
| // Two bytes char |
| return ( char ) ( ( ( bytes[pos] & 0x1C ) << 6 ) + // 110x-xxyy |
| // 10zz-zzzz |
| // -> |
| // 0000-0xxx |
| // 0000-0000 |
| ( ( bytes[pos] & 0x03 ) << 6 ) + // 110x-xxyy 10zz-zzzz |
| // -> 0000-0000 |
| // yy00-0000 |
| ( bytes[pos + 1] & 0x3F ) // 110x-xxyy 10zz-zzzz -> 0000-0000 |
| // 00zz-zzzz |
| ); // -> 0000-0xxx yyzz-zzzz (07FF) |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES ) |
| { |
| // Three bytes char |
| return ( char ) ( |
| // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-0000-0000-0000 |
| ( ( bytes[pos] & 0x0F ) << 12 ) |
| // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-xxxx-0000-0000 |
| + ( ( bytes[pos + 1] & 0x3C ) << 6 ) |
| // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-yy00-0000 |
| + ( ( bytes[pos + 1] & 0x03 ) << 6 ) |
| // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-00zz-zzzz |
| + ( bytes[pos + 2] & 0x3F ) |
| // -> tttt-xxxx yyzz-zzzz (FF FF) |
| ); |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES ) |
| { |
| // Four bytes char |
| return ( char ) ( |
| // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-tt00 |
| // 0000-0000 0000-0000 |
| ( ( bytes[pos] & 0x07 ) << 18 ) |
| // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-00uu |
| // 0000-0000 0000-0000 |
| + ( ( bytes[pos + 1] & 0x30 ) << 16 ) |
| // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 |
| // vvvv-0000 0000-0000 |
| + ( ( bytes[pos + 1] & 0x0F ) << 12 ) |
| // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 |
| // 0000-xxxx 0000-0000 |
| + ( ( bytes[pos + 2] & 0x3C ) << 6 ) |
| // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 |
| // 0000-0000 yy00-0000 |
| + ( ( bytes[pos + 2] & 0x03 ) << 6 ) |
| // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 |
| // 0000-0000 00zz-zzzz |
| + ( bytes[pos + 3] & 0x3F ) |
| // -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF) |
| ); |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) |
| { |
| // Five bytes char |
| return ( char ) ( |
| // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> |
| // 0000-00tt 0000-0000 0000-0000 0000-0000 |
| ( ( bytes[pos] & 0x03 ) << 24 ) |
| // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> |
| // 0000-0000 uuuu-uu00 0000-0000 0000-0000 |
| + ( ( bytes[pos + 1] & 0x3F ) << 18 ) |
| // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> |
| // 0000-0000 0000-00vv 0000-0000 0000-0000 |
| + ( ( bytes[pos + 2] & 0x30 ) << 12 ) |
| // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> |
| // 0000-0000 0000-0000 wwww-0000 0000-0000 |
| + ( ( bytes[pos + 2] & 0x0F ) << 12 ) |
| // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> |
| // 0000-0000 0000-0000 0000-xxxx 0000-0000 |
| + ( ( bytes[pos + 3] & 0x3C ) << 6 ) |
| // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> |
| // 0000-0000 0000-0000 0000-0000 yy00-0000 |
| + ( ( bytes[pos + 3] & 0x03 ) << 6 ) |
| // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> |
| // 0000-0000 0000-0000 0000-0000 00zz-zzzz |
| + ( bytes[pos + 4] & 0x3F ) |
| // -> 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF) |
| ); |
| } |
| else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) |
| { |
| // Six bytes char |
| return ( char ) ( |
| // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz |
| // -> |
| // 0s00-0000 0000-0000 0000-0000 0000-0000 |
| ( ( bytes[pos] & 0x01 ) << 30 ) |
| // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz |
| // -> |
| // 00tt-tttt 0000-0000 0000-0000 0000-0000 |
| + ( ( bytes[pos + 1] & 0x3F ) << 24 ) |
| // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy |
| // 10zz-zzzz -> |
| // 0000-0000 uuuu-uu00 0000-0000 0000-0000 |
| + ( ( bytes[pos + 2] & 0x3F ) << 18 ) |
| // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy |
| // 10zz-zzzz -> |
| // 0000-0000 0000-00vv 0000-0000 0000-0000 |
| + ( ( bytes[pos + 3] & 0x30 ) << 12 ) |
| // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy |
| // 10zz-zzzz -> |
| // 0000-0000 0000-0000 wwww-0000 0000-0000 |
| + ( ( bytes[pos + 3] & 0x0F ) << 12 ) |
| // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy |
| // 10zz-zzzz -> |
| // 0000-0000 0000-0000 0000-xxxx 0000-0000 |
| + ( ( bytes[pos + 4] & 0x3C ) << 6 ) |
| // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy |
| // 10zz-zzzz -> |
| // 0000-0000 0000-0000 0000-0000 yy00-0000 |
| + ( ( bytes[pos + 4] & 0x03 ) << 6 ) |
| // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz |
| // -> |
| // 0000-0000 0000-0000 0000-0000 00zz-zzzz |
| + ( bytes[pos + 5] & 0x3F ) |
| // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF) |
| ); |
| } |
| else |
| { |
| return ( char ) -1; |
| } |
| } |
| } |
| |
| /** |
| * Return the number of bytes that hold an Unicode char. |
| * |
| * @param car The character to be decoded |
| * @return The number of bytes to hold the char. TODO : Should stop after |
| * the third byte, as a char is only 2 bytes long. |
| */ |
| public static int countNbBytesPerChar( char car ) |
| { |
| if ( ( car & UnicodeConstants.CHAR_ONE_BYTE_MASK ) == 0 ) |
| { |
| return 1; |
| } |
| else if ( ( car & UnicodeConstants.CHAR_TWO_BYTES_MASK ) == 0 ) |
| { |
| return 2; |
| } |
| else if ( ( car & UnicodeConstants.CHAR_THREE_BYTES_MASK ) == 0 ) |
| { |
| return 3; |
| } |
| else if ( ( car & UnicodeConstants.CHAR_FOUR_BYTES_MASK ) == 0 ) |
| { |
| return 4; |
| } |
| else if ( ( car & UnicodeConstants.CHAR_FIVE_BYTES_MASK ) == 0 ) |
| { |
| return 5; |
| } |
| else if ( ( car & UnicodeConstants.CHAR_SIX_BYTES_MASK ) == 0 ) |
| { |
| return 6; |
| } |
| else |
| { |
| return -1; |
| } |
| } |
| |
| /** |
| * Count the number of bytes included in the given char[]. |
| * |
| * @param chars The char array to decode |
| * @return The number of bytes in the char array |
| */ |
| public static int countBytes( char[] chars ) |
| { |
| if ( chars == null ) |
| { |
| return 0; |
| } |
| |
| int nbBytes = 0; |
| int currentPos = 0; |
| |
| while ( currentPos < chars.length ) |
| { |
| int nbb = countNbBytesPerChar( chars[currentPos] ); |
| |
| // If the number of bytes necessary to encode a character is |
| // above 3, we will need two UTF-16 chars |
| currentPos += ( nbb < 4 ? 1 : 2 ); |
| nbBytes += nbb; |
| } |
| |
| return nbBytes; |
| } |
| |
| /** |
| * Count the number of chars included in the given byte[]. |
| * |
| * @param bytes The byte array to decode |
| * @return The number of char in the byte array |
| */ |
| public static int countChars( byte[] bytes ) |
| { |
| if ( bytes == null ) |
| { |
| return 0; |
| } |
| |
| int nbChars = 0; |
| int currentPos = 0; |
| |
| while ( currentPos < bytes.length ) |
| { |
| currentPos += countBytesPerChar(bytes, currentPos); |
| nbChars++; |
| } |
| |
| return nbChars; |
| } |
| |
| /** |
| * Return the Unicode char which is coded in the bytes at the given |
| * position. |
| * |
| * @param car The character to be transformed to an array of bytes |
| * |
| * @return The byte array representing the char |
| * |
| * TODO : Should stop after the third byte, as a char is only 2 bytes long. |
| */ |
| public static byte[] charToBytes( char car ) |
| { |
| byte[] bytes = new byte[countNbBytesPerChar(car)]; |
| |
| if ( car <= 0x7F ) |
| { |
| // Single byte char |
| bytes[0] = ( byte ) car; |
| return bytes; |
| } |
| else if ( car <= 0x7FF ) |
| { |
| // two bytes char |
| bytes[0] = ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ); |
| bytes[1] = ( byte ) ( 0x0080 + ( car & 0x3F ) ); |
| } |
| else |
| { |
| // Three bytes char |
| bytes[0] = ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ); |
| bytes[1] = ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ); |
| bytes[2] = ( byte ) ( 0x0080 + ( car & 0x3F ) ); |
| } |
| |
| return bytes; |
| } |
| |
| /** |
| * Check if the current char is in the unicodeSubset : all chars but |
| * '\0', '(', ')', '*' and '\' |
| * |
| * @param str The string to check |
| * @param pos Position of the current char |
| * @return True if the current char is in the unicode subset |
| */ |
| public static boolean isUnicodeSubset( String str, int pos ) |
| { |
| if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) ) |
| { |
| return false; |
| } |
| |
| char c = str.charAt( pos ); |
| |
| return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] ); |
| } |
| |
| /** |
| * Check if the current char is in the unicodeSubset : all chars but |
| * '\0', '(', ')', '*' and '\' |
| * |
| * @param c The char to check |
| * @return True if the current char is in the unicode subset |
| */ |
| public static boolean isUnicodeSubset( char c ) |
| { |
| return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] ); |
| } |
| |
| /** |
| * |
| * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation |
| * of every character in the string str. If str is null, the string value 'null' is written with a length of 0 |
| * instead of throwing an NullPointerException. Each character in the string s is converted to a group of one, |
| * two, or three bytes, depending on the value of the character. |
| * |
| * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is |
| * written in the length information (four bytes (writeInt)) and the string is split into smaller parts |
| * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes |
| * can be written at maximum we're on the save side when writing a chunk of only 21845 (65535/3) characters at |
| * once. |
| * |
| * See also {@link java.io.DataOutput#writeUTF(String)}. |
| * |
| * @param objectOutput The objectOutput to write to |
| * @param str The value to write |
| * @throws java.io.IOException If the value can't be written to the file |
| */ |
| public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException |
| { |
| // Write a 'null' string |
| if ( str == null ) |
| { |
| objectOutput.writeInt( 0 ); |
| objectOutput.writeUTF( "null" ); |
| } |
| else |
| { |
| // Write length of string |
| objectOutput.writeInt( str.length() ); |
| |
| StringBuffer strBuf = new StringBuffer( str ); |
| |
| // Write the string in portions not larger than 21845 characters |
| while ( strBuf != null ) |
| { |
| if ( strBuf.length() < 21845 ) |
| { |
| objectOutput.writeUTF( strBuf.substring( 0, strBuf.length() ) ); |
| strBuf = null; |
| } |
| else |
| { |
| objectOutput.writeUTF( strBuf.substring( 0, 21845 ) ); |
| strBuf.delete( 0, 21845 ); |
| } |
| } |
| } |
| } |
| |
| /** |
| * |
| * Reads in a string that has been encoded using a modified UTF-8 format. The general contract of readUTF is |
| * that it reads a representation of a Unicode character string encoded in modified UTF-8 format; this string of |
| * characters is then returned as a String. |
| * |
| * First, four bytes are read (readInt) and used to construct an unsigned 16-bit integer in exactly the manner |
| * of the readUnsignedShort method . This integer value is called the UTF length and specifies the number of |
| * additional bytes to be read. These bytes are then converted to characters by considering them in groups. The |
| * length of each group is computed from the value of the first byte of the group. The byte following a group, if |
| * any, is the first byte of the next group. |
| * |
| *See also {@link java.io.DataInput#readUTF()}. |
| * |
| * @param objectInput The objectInput to read from |
| * @return The read string |
| * @throws java.io.IOException If the value can't be read |
| */ |
| public static String readUTF( ObjectInput objectInput ) throws IOException |
| { |
| StringBuffer strBuf = null; |
| |
| // Read length of the string |
| int strLength = objectInput.readInt(); |
| |
| // Start reading the string |
| strBuf = new StringBuffer( objectInput.readUTF() ); |
| |
| if ( strLength == 0 && strBuf.toString().equals( "null" ) ) |
| { |
| // The special case of a 'null' string |
| return null; |
| } |
| else |
| { |
| while ( strLength > strBuf.length() ) |
| { |
| strBuf.append( objectInput.readUTF() ); |
| } |
| return strBuf.toString(); |
| } |
| } |
| } |