util/src/main/java/org/apache/directory/api/util/Unicode.java - directory-ldap-api - Git at Google

 /*
  *  Licensed to the Apache Software Foundation (ASF) under one
  *  or more contributor license agreements.  See the NOTICE file
  *  distributed with this work for additional information
  *  regarding copyright ownership.  The ASF licenses this file
  *  to you under the Apache License, Version 2.0 (the
  *  "License"); you may not use this file except in compliance
  *  with the License.  You may obtain a copy of the License at
  *
  *    https://www.apache.org/licenses/LICENSE-2.0
  *
  *  Unless required by applicable law or agreed to in writing,
  *  software distributed under the License is distributed on an
  *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  *  KIND, either express or implied.  See the License for the
  *  specific language governing permissions and limitations
  *  under the License.
  *
  */
 package org.apache.directory.api.util;


 import java.io.IOException;
 import java.io.ObjectInput;
 import java.io.ObjectOutput;


 /**
  * Various unicode manipulation methods that are more efficient than chaining
  * operations: all is done in the same buffer without creating a bunch of string
  * objects.
  *
  * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
  */
 public final class Unicode
 {
     private static final int UTF8_MULTI_BYTES_MASK = 0x0080;
     private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
     private static final int UTF8_TWO_BYTES = 0x00C0;
     private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
     private static final int UTF8_THREE_BYTES = 0x00E0;
     private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
     private static final int UTF8_FOUR_BYTES = 0x00F0;
     private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
     private static final int UTF8_FIVE_BYTES = 0x00F8;
     private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
     private static final int UTF8_SIX_BYTES = 0x00FC;

     /** The payload bits of a continuation byte (10xx-xxxx) */
     private static final int UTF8_CONTINUATION_PAYLOAD = 0x003F;

     /**
      * The maximum number of chars handed to a single writeUTF call : a char
      * takes at most 3 bytes and a call can't write more than 65535 bytes,
      * so 65535/3 chars are always safe.
      */
     private static final int MAX_CHARS_PER_CHUNK = 21845;

     /** %01-%27 %2B-%5B %5D-%7F : every ASCII char but NUL, '(', ')', '*' and the backslash */
     private static final boolean[] UNICODE_SUBSET = new boolean[128];

     static
     {
         // Index 0 (NUL) keeps its default value, false
         for ( int i = 1; i < UNICODE_SUBSET.length; i++ )
         {
             UNICODE_SUBSET[i] = true;
         }

         UNICODE_SUBSET['('] = false;
         UNICODE_SUBSET[')'] = false;
         UNICODE_SUBSET['*'] = false;
         UNICODE_SUBSET['\\'] = false;
     }


     private Unicode()
     {
     }


     /**
      * Count the number of bytes needed to return an Unicode char. This can be
      * from 1 to 6.
      *
      * @param bytes The bytes to read
      * @param pos Position to start counting. It must be a valid start of a
      *            encoded char !
      * @return The number of bytes to create a char, or -1 if the array is null
      *         or if the byte at the given position is not a valid first byte.
      *         TODO : Should stop after the third byte, as a char is only
      *         2 bytes long.
      * @throws ArrayIndexOutOfBoundsException If the position is outside the array
      */
     public static int countBytesPerChar( byte[] bytes, int pos )
     {
         if ( bytes == null )
         {
             return -1;
         }

         int lead = bytes[pos];

         if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
         {
             return 1;
         }
         else if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
         {
             return 2;
         }
         else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
         {
             return 3;
         }
         else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
         {
             return 4;
         }
         else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
         {
             return 5;
         }
         else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
         {
             return 6;
         }
         else
         {
             // A continuation byte (10xx-xxxx), 0xFE or 0xFF : not a first byte
             return -1;
         }
     }


     /**
      * Return the Unicode char which is coded in the bytes at position 0.
      *
      * @param bytes The byte[] representation of an Unicode string.
      * @return The first char found.
      */
     public static char bytesToChar( byte[] bytes )
     {
         return bytesToChar( bytes, 0 );
     }


     /**
      * Return the Unicode char which is coded in the bytes at the given
      * position.
      *
      * The decoded value is narrowed to a char : for sequences of four bytes
      * and more, only the 16 lower bits of the decoded value are returned.
      *
      * @param bytes The byte[] representation of an Unicode string.
      * @param pos The current position to start decoding the char
      * @return The decoded char, or (char)-1 (U+FFFF) if the array is null or
      *         if the byte at the given position is not a valid first byte.
      *         TODO : Should stop after the third byte, as a char is only
      *         2 bytes long.
      * @throws ArrayIndexOutOfBoundsException If the position is outside the
      *         array, or if the sequence is truncated
      */
     public static char bytesToChar( byte[] bytes, int pos )
     {
         int nbBytes = countBytesPerChar( bytes, pos );

         if ( nbBytes < 0 )
         {
             return ( char ) -1;
         }

         if ( nbBytes == 1 )
         {
             return ( char ) bytes[pos];
         }

         // The first byte holds (7 - nbBytes) payload bits :
         // 110x-xxxx, 1110-xxxx, 1111-0xxx, 1111-10xx, 1111-110x
         int value = bytes[pos] & ( 0x007F >> nbBytes );

         // Each following byte (10xx-xxxx) brings 6 more bits
         for ( int i = 1; i < nbBytes; i++ )
         {
             value = ( value << 6 ) | ( bytes[pos + i] & UTF8_CONTINUATION_PAYLOAD );
         }

         return ( char ) value;
     }


     /**
      * Return the number of bytes needed to encode a char.
      *
      * A char is a 16 bits value, so the result is always 1, 2 or 3. A
      * surrogate is counted like any other char above 0x07FF : 3 bytes.
      *
      * @param car The character to be encoded
      * @return The number of bytes to hold the char.
      */
     public static int countNbBytesPerChar( char car )
     {
         if ( car <= 0x007F )
         {
             return 1;
         }
         else if ( car <= 0x07FF )
         {
             return 2;
         }
         else
         {
             return 3;
         }
     }


     /**
      * Count the number of bytes needed to encode the given char[], each
      * char being encoded on its own (see {@link #countNbBytesPerChar(char)}).
      *
      * @param chars The char array to encode
      * @return The number of bytes needed for the char array, 0 if it is null
      */
     public static int countBytes( char[] chars )
     {
         if ( chars == null )
         {
             return 0;
         }

         int nbBytes = 0;

         for ( char car : chars )
         {
             nbBytes += countNbBytesPerChar( car );
         }

         return nbBytes;
     }


     /**
      * Count the number of chars included in the given byte[].
      *
      * A byte which is not a valid first byte is counted as one char and
      * skipped, so the method always terminates on malformed input.
      *
      * @param bytes The byte array to decode
      * @return The number of char in the byte array, 0 if it is null
      */
     public static int countChars( byte[] bytes )
     {
         if ( bytes == null )
         {
             return 0;
         }

         int nbChars = 0;
         int currentPos = 0;

         while ( currentPos < bytes.length )
         {
             int nbBytes = countBytesPerChar( bytes, currentPos );

             // Never move backward : adding the -1 error code to the position
             // would loop forever or read before the start of the array
             currentPos += ( nbBytes > 0 ) ? nbBytes : 1;
             nbChars++;
         }

         return nbChars;
     }


     /**
      * Encode the given char as UTF-8 : one, two or three bytes depending on
      * its value.
      *
      * @param car The character to be transformed to an array of bytes
      *
      * @return The byte array representing the char
      */
     public static byte[] charToBytes( char car )
     {
         if ( car <= 0x007F )
         {
             // Single byte char : 0xxx-xxxx
             return new byte[]
                 { ( byte ) car };
         }
         else if ( car <= 0x07FF )
         {
             // Two bytes char : 110x-xxxx 10xx-xxxx
             return new byte[]
                 {
                     ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ),
                     ( byte ) ( 0x0080 + ( car & 0x3F ) )
                 };
         }
         else
         {
             // Three bytes char : 1110-xxxx 10xx-xxxx 10xx-xxxx
             return new byte[]
                 {
                     ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ),
                     ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ),
                     ( byte ) ( 0x0080 + ( car & 0x3F ) )
                 };
         }
     }


     /**
      * Check if the current char is in the unicodeSubset : all chars but
      * NUL, '(', ')', '*' and the backslash
      *
      * @param str The string to check
      * @param pos Position of the current char
      * @return True if the current char is in the unicode subset, false if it
      *         is not, if the string is null or if the position is out of range
      */
     public static boolean isUnicodeSubset( String str, int pos )
     {
         if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
         {
             return false;
         }

         return isUnicodeSubset( str.charAt( pos ) );
     }


     /**
      * Check if the current char is in the unicodeSubset : all chars but
      * NUL, '(', ')', '*' and the backslash
      *
      * @param c The char to check
      * @return True if the current char is in the unicode subset
      */
     public static boolean isUnicodeSubset( char c )
     {
         return ( c > 127 ) || UNICODE_SUBSET[c];
     }


     /**
      * Check if the current byte is in the unicodeSubset : all chars but
      * NUL, '(', ')', '*' and the backslash
      *
      * @param b The byte to check
      * @return True if the current byte is in the unicode subset
      */
     public static boolean isUnicodeSubset( byte b )
     {
         // A negative byte is a part of a multi-bytes sequence, always accepted
         return ( b < 0 ) || UNICODE_SUBSET[b];
     }


     /**
      *
      * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
      * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
      * instead of throwing an NullPointerException. Each character in the string is converted to a group of one,
      * two, or three bytes, depending on the value of the character.
      *
      * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length, in
      * chars, is written in the length information (four bytes (writeInt)) and the string is split into smaller parts
      * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes
      * can be written at maximum we're on the safe side when writing a chunk of only 21845 (65535/3) characters at
      * once.
      *
      * Exactly one chunk is written for an empty string, and no empty chunk is ever written after a non empty
      * one : {@link #readUTF(ObjectInput)} stops reading as soon as it has got the announced number of chars, so
      * such a chunk would remain in the stream and corrupt the next read.
      *
      * See also {@link java.io.DataOutput#writeUTF(String)}.
      *
      * @param objectOutput The objectOutput to write to
      * @param str The value to write
      * @throws java.io.IOException If the value can't be written to the file
      */
     public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
     {
         // Write a 'null' string
         if ( str == null )
         {
             objectOutput.writeInt( 0 );
             objectOutput.writeUTF( "null" );

             return;
         }

         // Write length of string
         int length = str.length();
         objectOutput.writeInt( length );

         // Write the string in portions not larger than 21845 characters
         int start = 0;

         do
         {
             int end = start + Math.min( MAX_CHARS_PER_CHUNK, length - start );

             objectOutput.writeUTF( str.substring( start, end ) );
             start = end;
         }
         while ( start < length );
     }


     /**
      *
      * Reads in a string that has been written by {@link #writeUTF(ObjectOutput, String)}.
      *
      * First, four bytes are read (readInt) : they hold the number of chars of the string. Then chunks encoded in
      * modified UTF-8 format are read (readUTF) and concatenated until this number of chars has been reached. At
      * least one chunk is always read. A length of 0 followed by the chunk 'null' stands for a null string.
      *
      * See also {@link java.io.DataInput#readUTF()}.
      *
      * @param objectInput The objectInput to read from
      * @return The read string, which may be null
      * @throws java.io.IOException If the value can't be read
      */
     public static String readUTF( ObjectInput objectInput ) throws IOException
     {
         // Read length of the string
         int strLength = objectInput.readInt();

         // Start reading the string
         String firstChunk = objectInput.readUTF();

         if ( ( strLength == 0 ) && "null".equals( firstChunk ) )
         {
             // The special case of a 'null' string
             return null;
         }

         StringBuilder strBuf = new StringBuilder( firstChunk );

         while ( strLength > strBuf.length() )
         {
             strBuf.append( objectInput.readUTF() );
         }

         return strBuf.toString();
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* https://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*/
	package org.apache.directory.api.util;


	import java.io.IOException;
	import java.io.ObjectInput;
	import java.io.ObjectOutput;


	/**
	 * Various unicode manipulation methods that are more efficient than chaining
	 * operations: all is done in the same buffer without creating a bunch of string
	 * objects.
	 *
	 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
	 */
	public final class Unicode
	{
	    private static final int UTF8_MULTI_BYTES_MASK = 0x0080;
	    private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
	    private static final int UTF8_TWO_BYTES = 0x00C0;
	    private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
	    private static final int UTF8_THREE_BYTES = 0x00E0;
	    private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
	    private static final int UTF8_FOUR_BYTES = 0x00F0;
	    private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
	    private static final int UTF8_FIVE_BYTES = 0x00F8;
	    private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
	    private static final int UTF8_SIX_BYTES = 0x00FC;

	    /** The payload bits of a continuation byte (10xx-xxxx) */
	    private static final int UTF8_CONTINUATION_PAYLOAD = 0x003F;

	    /**
	     * The maximum number of chars handed to a single writeUTF call : a char
	     * takes at most 3 bytes and a call can't write more than 65535 bytes,
	     * so 65535/3 chars are always safe.
	     */
	    private static final int MAX_CHARS_PER_CHUNK = 21845;

	    /** %01-%27 %2B-%5B %5D-%7F : every ASCII char but NUL, '(', ')', '*' and the backslash */
	    private static final boolean[] UNICODE_SUBSET = new boolean[128];

	    static
	    {
	        // Index 0 (NUL) keeps its default value, false
	        for ( int i = 1; i < UNICODE_SUBSET.length; i++ )
	        {
	            UNICODE_SUBSET[i] = true;
	        }

	        UNICODE_SUBSET['('] = false;
	        UNICODE_SUBSET[')'] = false;
	        UNICODE_SUBSET['*'] = false;
	        UNICODE_SUBSET['\\'] = false;
	    }


	    private Unicode()
	    {
	    }


	    /**
	     * Count the number of bytes needed to return an Unicode char. This can be
	     * from 1 to 6.
	     *
	     * @param bytes The bytes to read
	     * @param pos Position to start counting. It must be a valid start of a
	     *            encoded char !
	     * @return The number of bytes to create a char, or -1 if the array is null
	     *         or if the byte at the given position is not a valid first byte.
	     *         TODO : Should stop after the third byte, as a char is only
	     *         2 bytes long.
	     * @throws ArrayIndexOutOfBoundsException If the position is outside the array
	     */
	    public static int countBytesPerChar( byte[] bytes, int pos )
	    {
	        if ( bytes == null )
	        {
	            return -1;
	        }

	        int lead = bytes[pos];

	        if ( ( lead & UTF8_MULTI_BYTES_MASK ) == 0 )
	        {
	            return 1;
	        }
	        else if ( ( lead & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
	        {
	            return 2;
	        }
	        else if ( ( lead & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
	        {
	            return 3;
	        }
	        else if ( ( lead & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
	        {
	            return 4;
	        }
	        else if ( ( lead & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
	        {
	            return 5;
	        }
	        else if ( ( lead & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
	        {
	            return 6;
	        }
	        else
	        {
	            // A continuation byte (10xx-xxxx), 0xFE or 0xFF : not a first byte
	            return -1;
	        }
	    }


	    /**
	     * Return the Unicode char which is coded in the bytes at position 0.
	     *
	     * @param bytes The byte[] representation of an Unicode string.
	     * @return The first char found.
	     */
	    public static char bytesToChar( byte[] bytes )
	    {
	        return bytesToChar( bytes, 0 );
	    }


	    /**
	     * Return the Unicode char which is coded in the bytes at the given
	     * position.
	     *
	     * The decoded value is narrowed to a char : for sequences of four bytes
	     * and more, only the 16 lower bits of the decoded value are returned.
	     *
	     * @param bytes The byte[] representation of an Unicode string.
	     * @param pos The current position to start decoding the char
	     * @return The decoded char, or (char)-1 (U+FFFF) if the array is null or
	     *         if the byte at the given position is not a valid first byte.
	     *         TODO : Should stop after the third byte, as a char is only
	     *         2 bytes long.
	     * @throws ArrayIndexOutOfBoundsException If the position is outside the
	     *         array, or if the sequence is truncated
	     */
	    public static char bytesToChar( byte[] bytes, int pos )
	    {
	        int nbBytes = countBytesPerChar( bytes, pos );

	        if ( nbBytes < 0 )
	        {
	            return ( char ) -1;
	        }

	        if ( nbBytes == 1 )
	        {
	            return ( char ) bytes[pos];
	        }

	        // The first byte holds (7 - nbBytes) payload bits :
	        // 110x-xxxx, 1110-xxxx, 1111-0xxx, 1111-10xx, 1111-110x
	        int value = bytes[pos] & ( 0x007F >> nbBytes );

	        // Each following byte (10xx-xxxx) brings 6 more bits
	        for ( int i = 1; i < nbBytes; i++ )
	        {
	            value = ( value << 6 ) | ( bytes[pos + i] & UTF8_CONTINUATION_PAYLOAD );
	        }

	        return ( char ) value;
	    }


	    /**
	     * Return the number of bytes needed to encode a char.
	     *
	     * A char is a 16 bits value, so the result is always 1, 2 or 3. A
	     * surrogate is counted like any other char above 0x07FF : 3 bytes.
	     *
	     * @param car The character to be encoded
	     * @return The number of bytes to hold the char.
	     */
	    public static int countNbBytesPerChar( char car )
	    {
	        if ( car <= 0x007F )
	        {
	            return 1;
	        }
	        else if ( car <= 0x07FF )
	        {
	            return 2;
	        }
	        else
	        {
	            return 3;
	        }
	    }


	    /**
	     * Count the number of bytes needed to encode the given char[], each
	     * char being encoded on its own (see {@link #countNbBytesPerChar(char)}).
	     *
	     * @param chars The char array to encode
	     * @return The number of bytes needed for the char array, 0 if it is null
	     */
	    public static int countBytes( char[] chars )
	    {
	        if ( chars == null )
	        {
	            return 0;
	        }

	        int nbBytes = 0;

	        for ( char car : chars )
	        {
	            nbBytes += countNbBytesPerChar( car );
	        }

	        return nbBytes;
	    }


	    /**
	     * Count the number of chars included in the given byte[].
	     *
	     * A byte which is not a valid first byte is counted as one char and
	     * skipped, so the method always terminates on malformed input.
	     *
	     * @param bytes The byte array to decode
	     * @return The number of char in the byte array, 0 if it is null
	     */
	    public static int countChars( byte[] bytes )
	    {
	        if ( bytes == null )
	        {
	            return 0;
	        }

	        int nbChars = 0;
	        int currentPos = 0;

	        while ( currentPos < bytes.length )
	        {
	            int nbBytes = countBytesPerChar( bytes, currentPos );

	            // Never move backward : adding the -1 error code to the position
	            // would loop forever or read before the start of the array
	            currentPos += ( nbBytes > 0 ) ? nbBytes : 1;
	            nbChars++;
	        }

	        return nbChars;
	    }


	    /**
	     * Encode the given char as UTF-8 : one, two or three bytes depending on
	     * its value.
	     *
	     * @param car The character to be transformed to an array of bytes
	     *
	     * @return The byte array representing the char
	     */
	    public static byte[] charToBytes( char car )
	    {
	        if ( car <= 0x007F )
	        {
	            // Single byte char : 0xxx-xxxx
	            return new byte[]
	                { ( byte ) car };
	        }
	        else if ( car <= 0x07FF )
	        {
	            // Two bytes char : 110x-xxxx 10xx-xxxx
	            return new byte[]
	                {
	                    ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ),
	                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
	                };
	        }
	        else
	        {
	            // Three bytes char : 1110-xxxx 10xx-xxxx 10xx-xxxx
	            return new byte[]
	                {
	                    ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ),
	                    ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ),
	                    ( byte ) ( 0x0080 + ( car & 0x3F ) )
	                };
	        }
	    }


	    /**
	     * Check if the current char is in the unicodeSubset : all chars but
	     * NUL, '(', ')', '*' and the backslash
	     *
	     * @param str The string to check
	     * @param pos Position of the current char
	     * @return True if the current char is in the unicode subset, false if it
	     *         is not, if the string is null or if the position is out of range
	     */
	    public static boolean isUnicodeSubset( String str, int pos )
	    {
	        if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
	        {
	            return false;
	        }

	        return isUnicodeSubset( str.charAt( pos ) );
	    }


	    /**
	     * Check if the current char is in the unicodeSubset : all chars but
	     * NUL, '(', ')', '*' and the backslash
	     *
	     * @param c The char to check
	     * @return True if the current char is in the unicode subset
	     */
	    public static boolean isUnicodeSubset( char c )
	    {
	        return ( c > 127 ) || UNICODE_SUBSET[c];
	    }


	    /**
	     * Check if the current byte is in the unicodeSubset : all chars but
	     * NUL, '(', ')', '*' and the backslash
	     *
	     * @param b The byte to check
	     * @return True if the current byte is in the unicode subset
	     */
	    public static boolean isUnicodeSubset( byte b )
	    {
	        // A negative byte is a part of a multi-bytes sequence, always accepted
	        return ( b < 0 ) || UNICODE_SUBSET[b];
	    }


	    /**
	     *
	     * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
	     * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
	     * instead of throwing an NullPointerException. Each character in the string is converted to a group of one,
	     * two, or three bytes, depending on the value of the character.
	     *
	     * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length, in
	     * chars, is written in the length information (four bytes (writeInt)) and the string is split into smaller parts
	     * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes
	     * can be written at maximum we're on the safe side when writing a chunk of only 21845 (65535/3) characters at
	     * once.
	     *
	     * Exactly one chunk is written for an empty string, and no empty chunk is ever written after a non empty
	     * one : {@link #readUTF(ObjectInput)} stops reading as soon as it has got the announced number of chars, so
	     * such a chunk would remain in the stream and corrupt the next read.
	     *
	     * See also {@link java.io.DataOutput#writeUTF(String)}.
	     *
	     * @param objectOutput The objectOutput to write to
	     * @param str The value to write
	     * @throws java.io.IOException If the value can't be written to the file
	     */
	    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
	    {
	        // Write a 'null' string
	        if ( str == null )
	        {
	            objectOutput.writeInt( 0 );
	            objectOutput.writeUTF( "null" );

	            return;
	        }

	        // Write length of string
	        int length = str.length();
	        objectOutput.writeInt( length );

	        // Write the string in portions not larger than 21845 characters
	        int start = 0;

	        do
	        {
	            int end = start + Math.min( MAX_CHARS_PER_CHUNK, length - start );

	            objectOutput.writeUTF( str.substring( start, end ) );
	            start = end;
	        }
	        while ( start < length );
	    }


	    /**
	     *
	     * Reads in a string that has been written by {@link #writeUTF(ObjectOutput, String)}.
	     *
	     * First, four bytes are read (readInt) : they hold the number of chars of the string. Then chunks encoded in
	     * modified UTF-8 format are read (readUTF) and concatenated until this number of chars has been reached. At
	     * least one chunk is always read. A length of 0 followed by the chunk 'null' stands for a null string.
	     *
	     * See also {@link java.io.DataInput#readUTF()}.
	     *
	     * @param objectInput The objectInput to read from
	     * @return The read string, which may be null
	     * @throws java.io.IOException If the value can't be read
	     */
	    public static String readUTF( ObjectInput objectInput ) throws IOException
	    {
	        // Read length of the string
	        int strLength = objectInput.readInt();

	        // Start reading the string
	        String firstChunk = objectInput.readUTF();

	        if ( ( strLength == 0 ) && "null".equals( firstChunk ) )
	        {
	            // The special case of a 'null' string
	            return null;
	        }

	        StringBuilder strBuf = new StringBuilder( firstChunk );

	        while ( strLength > strBuf.length() )
	        {
	            strBuf.append( objectInput.readUTF() );
	        }

	        return strBuf.toString();
	    }
	}