lightning-core/src/main/java/org/apache/directmemory/lightning/internal/util/UnicodeUtil.java - directmemory-lightning - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.directmemory.lightning.internal.util;

 import java.io.IOException;
 import java.lang.reflect.Constructor;

 import org.apache.directmemory.lightning.Source;
 import org.apache.directmemory.lightning.Target;

 /*
  * This code is derived from the org.apache.lucene.util.UnicodeUtil class of the Apache Lucene project.
  */

 /*
  * Some of this code came from the excellent Unicode
  * conversion examples from:
  *
  *   http://www.unicode.org/Public/PROGRAMS/CVTUTF
  *
  * Full Copyright for that code follows:
  */

 /*
  * Copyright 2001-2004 Unicode, Inc.
  *
  * Disclaimer
  *
  * This source code is provided as is by Unicode, Inc. No claims are
  * made as to fitness for any particular purpose. No warranties of any
  * kind are expressed or implied. The recipient agrees to determine
  * applicability of information provided. If this file has been
  * purchased on magnetic or optical media from Unicode, Inc., the
  * sole remedy for any claim will be exchange of defective media
  * within 90 days of receipt.
  *
  * Limitations on Rights to Redistribute This Code
  *
  * Unicode, Inc. hereby grants the right to freely use the information
  * supplied in this file in the creation of products supporting the
  * Unicode Standard, and to make copies of this file in any form
  * for internal or external distribution as long as this notice
  * remains attached.
  */

 /*
  * Additional code came from the IBM ICU library.
  *
  *  http://www.icu-project.org
  *
  * Full Copyright for that code follows.
  */

 /*
  * Copyright (C) 1999-2010, International Business Machines
  * Corporation and others.  All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * in the Software without restriction, including without limitation the rights
  * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
  * Software, and to permit persons to whom the Software is furnished to do so,
  * provided that the above copyright notice(s) and this permission notice appear
  * in all copies of the Software and that both the above copyright notice(s) and
  * this permission notice appear in supporting documentation.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
  * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
  * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  *
  * Except as contained in this notice, the name of a copyright holder shall not
  * be used in advertising or otherwise to promote the sale, use or other
  * dealings in this Software without prior written authorization of the
  * copyright holder.
  */

 /**
  * Converts between Java's UTF-16 strings and a length-prefixed UTF-8 byte stream, writing straight to a
  * {@link Target} and reading straight from a {@link Source} instead of allocating an intermediate byte[] the way
  * String.getBytes("UTF-8") does.
  * <p>
  * Stream layout shared by {@link #UTF16toUTF8(String, Target)} and {@link #UTF8toUTF16(Source)}: one int holding
  * the number of UTF-16 chars, followed by the UTF-8 encoding of those chars.
  */
 public final class UnicodeUtil
 {

     /** First code unit of the high (leading) surrogate range. */
     public static final int UNI_SUR_HIGH_START = 0xD800;

     /** Last code unit of the high (leading) surrogate range. */
     public static final int UNI_SUR_HIGH_END = 0xDBFF;

     /** First code unit of the low (trailing) surrogate range. */
     public static final int UNI_SUR_LOW_START = 0xDC00;

     /** Last code unit of the low (trailing) surrogate range. */
     public static final int UNI_SUR_LOW_END = 0xDFFF;

     /** U+FFFD, substituted for unpaired surrogates while encoding. */
     public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;

     // Largest code point that fits into a single UTF-16 char (inclusive bound).
     private static final int UNI_MAX_BMP = 0x0000FFFF;

     // Each surrogate carries 10 payload bits of the supplementary code point.
     private static final int HALF_SHIFT = 10;

     private static final int HALF_MASK = 0x3FF;

     // Added to (high << HALF_SHIFT) + low to obtain the code point of a surrogate pair in one step.
     private static final int SURROGATE_OFFSET = Character.MIN_SUPPLEMENTARY_CODE_POINT
         - ( UNI_SUR_HIGH_START << HALF_SHIFT ) - UNI_SUR_LOW_START;

     private UnicodeUtil()
     {
         // static utility class, no instances
     }

     /**
      * Writes the given {@link String} to the target as a char-count prefix (one int) followed by its UTF-8 bytes.
      * <p>
      * An unpaired or out-of-order surrogate is written as U+FFFD. That substitution is one char for one char, so
      * the char count written up front still matches what {@link #UTF8toUTF16(Source)} reconstructs.
      *
      * @param value string to encode; must not be null
      * @param target sink receiving the length prefix and the encoded bytes
      * @return number of UTF-8 bytes written, not counting the length prefix
      * @throws IOException declared for failures of the underlying target writes
      */
     public static int UTF16toUTF8( String value, Target target )
         throws IOException
     {
         // charAt is used instead of toCharArray() so that no copy of the string content is allocated.
         final int length = value.length();

         // The prefix counts UTF-16 chars, not bytes; the decoder sizes its output array from it.
         target.writeInt( length );

         int writtenBytes = 0;
         int i = 0;
         while ( i < length )
         {
             final int code = value.charAt( i++ );

             if ( code < 0x80 )
             {
                 // 1 byte: 0xxxxxxx
                 target.writeByte( (byte) code );
                 writtenBytes++;
             }
             else if ( code < 0x800 )
             {
                 // 2 bytes: 110xxxxx 10xxxxxx
                 target.writeByte( (byte) ( 0xC0 | ( code >> 6 ) ) );
                 target.writeByte( (byte) ( 0x80 | ( code & 0x3F ) ) );
                 writtenBytes += 2;
             }
             else if ( code < UNI_SUR_HIGH_START || code > UNI_SUR_LOW_END )
             {
                 // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                 target.writeByte( (byte) ( 0xE0 | ( code >> 12 ) ) );
                 target.writeByte( (byte) ( 0x80 | ( ( code >> 6 ) & 0x3F ) ) );
                 target.writeByte( (byte) ( 0x80 | ( code & 0x3F ) ) );
                 writtenBytes += 3;
             }
             else if ( code <= UNI_SUR_HIGH_END && i < length && Character.isLowSurrogate( value.charAt( i ) ) )
             {
                 // Well-formed surrogate pair: combine both halves and consume the low surrogate as well.
                 final int utf32 = ( code << HALF_SHIFT ) + value.charAt( i++ ) + SURROGATE_OFFSET;

                 // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                 target.writeByte( (byte) ( 0xF0 | ( utf32 >> 18 ) ) );
                 target.writeByte( (byte) ( 0x80 | ( ( utf32 >> 12 ) & 0x3F ) ) );
                 target.writeByte( (byte) ( 0x80 | ( ( utf32 >> 6 ) & 0x3F ) ) );
                 target.writeByte( (byte) ( 0x80 | ( utf32 & 0x3F ) ) );
                 writtenBytes += 4;
             }
             else
             {
                 // Unpaired high surrogate or stray low surrogate: emit U+FFFD (EF BF BD).
                 target.writeByte( (byte) 0xEF );
                 target.writeByte( (byte) 0xBF );
                 target.writeByte( (byte) 0xBD );
                 writtenBytes += 3;
             }
         }
         return writtenBytes;
     }

     /**
      * Checks that every surrogate in the sequence is part of a well-formed high/low pair.
      *
      * @param s sequence to inspect; must not be null
      * @return true if no unpaired surrogate occurs, false otherwise
      */
     public static boolean validUTF16String( CharSequence s )
     {
         final int size = s.length();
         for ( int i = 0; i < size; i++ )
         {
             final char ch = s.charAt( i );
             if ( Character.isHighSurrogate( ch ) )
             {
                 // A high surrogate must be directly followed by a low surrogate.
                 if ( i + 1 >= size || !Character.isLowSurrogate( s.charAt( i + 1 ) ) )
                 {
                     return false;
                 }
                 i++; // skip the low half that was just verified
             }
             else if ( Character.isLowSurrogate( ch ) )
             {
                 // Low surrogate without a preceding high surrogate.
                 return false;
             }
         }

         return true;
     }

     /**
      * Checks that every surrogate within the first size chars of the array is part of a well-formed high/low pair.
      *
      * @param s array to inspect; must not be null when size is positive
      * @param size number of leading chars to inspect
      * @return true if no unpaired surrogate occurs in the inspected range, false otherwise
      */
     public static boolean validUTF16String( char[] s, int size )
     {
         for ( int i = 0; i < size; i++ )
         {
             final char ch = s[i];
             if ( Character.isHighSurrogate( ch ) )
             {
                 // A high surrogate must be directly followed by a low surrogate inside the inspected range.
                 if ( i + 1 >= size || !Character.isLowSurrogate( s[i + 1] ) )
                 {
                     return false;
                 }
                 i++; // skip the low half that was just verified
             }
             else if ( Character.isLowSurrogate( ch ) )
             {
                 // Low surrogate without a preceding high surrogate.
                 return false;
             }
         }

         return true;
     }

     /**
      * Reads a char-count prefix (one int) from the given {@link Source}, then decodes UTF-8 bytes from it until
      * that many UTF-16 chars have been produced.
      * <p>
      * NOTE: The bytes are not validated as well-formed UTF-8 (continuation bytes and overlong forms are not
      * checked; lead bytes are only covered by assertions). The only structural checks are that the length prefix
      * is not negative and that a surrogate pair still fits into the announced char count.
      *
      * @param source input positioned at the length prefix
      * @return the decoded string
      * @throws IOException if the length prefix is negative, if a supplementary code point would exceed the
      *             announced char count, or as declared by the underlying source reads
      */
     public static String UTF8toUTF16( Source source )
         throws IOException
     {
         final int charLength = source.readInt();
         if ( charLength < 0 )
         {
             throw new IOException( "Invalid negative string length: " + charLength );
         }

         final char[] out = new char[charLength];
         int offset = 0;
         while ( offset < charLength )
         {
             final int b = source.readByte() & 0xff;
             if ( b < 0xc0 )
             {
                 // Single byte; 0x80..0xbf would be a continuation byte in lead position.
                 assert b < 0x80;
                 out[offset++] = (char) b;
             }
             else if ( b < 0xe0 )
             {
                 out[offset++] = (char) ( ( ( b & 0x1f ) << 6 ) + ( source.readByte() & 0x3f ) );
             }
             else if ( b < 0xf0 )
             {
                 out[offset++] =
                     (char) ( ( ( b & 0xf ) << 12 ) + ( ( source.readByte() & 0x3f ) << 6 ) + ( source.readByte() & 0x3f ) );
             }
             else
             {
                 assert b < 0xf8 : "b = 0x" + Integer.toHexString( b );
                 final int ch =
                     ( ( b & 0x7 ) << 18 ) + ( ( source.readByte() & 0x3f ) << 12 )
                         + ( ( source.readByte() & 0x3f ) << 6 ) + ( source.readByte() & 0x3f );
                 if ( ch <= UNI_MAX_BMP )
                 {
                     // UNI_MAX_BMP itself still fits into one char, hence the inclusive comparison.
                     out[offset++] = (char) ch;
                 }
                 else
                 {
                     // A supplementary code point needs two chars; refuse to run past the announced length.
                     if ( offset + 1 >= charLength )
                     {
                         throw new IOException( "Surrogate pair exceeds announced string length: " + charLength );
                     }
                     final int chHalf = ch - Character.MIN_SUPPLEMENTARY_CODE_POINT;
                     out[offset++] = (char) ( ( chHalf >> HALF_SHIFT ) + UNI_SUR_HIGH_START );
                     out[offset++] = (char) ( ( chHalf & HALF_MASK ) + UNI_SUR_LOW_START );
                 }
             }
         }

         // The public constructor is used on purpose: reflective access to String's package-private
         // array-sharing constructor is not portable across JDKs.
         return new String( out );
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.directmemory.lightning.internal.util;

	import java.io.IOException;
	import java.lang.reflect.Constructor;

	import org.apache.directmemory.lightning.Source;
	import org.apache.directmemory.lightning.Target;

	/*
	* This code is derived from the org.apache.lucene.util.UnicodeUtil class of the Apache Lucene project.
	*/

	/*
	* Some of this code came from the excellent Unicode
	* conversion examples from:
	*
	* http://www.unicode.org/Public/PROGRAMS/CVTUTF
	*
	* Full Copyright for that code follows:
	*/

	/*
	* Copyright 2001-2004 Unicode, Inc.
	*
	* Disclaimer
	*
	* This source code is provided as is by Unicode, Inc. No claims are
	* made as to fitness for any particular purpose. No warranties of any
	* kind are expressed or implied. The recipient agrees to determine
	* applicability of information provided. If this file has been
	* purchased on magnetic or optical media from Unicode, Inc., the
	* sole remedy for any claim will be exchange of defective media
	* within 90 days of receipt.
	*
	* Limitations on Rights to Redistribute This Code
	*
	* Unicode, Inc. hereby grants the right to freely use the information
	* supplied in this file in the creation of products supporting the
	* Unicode Standard, and to make copies of this file in any form
	* for internal or external distribution as long as this notice
	* remains attached.
	*/

	/*
	* Additional code came from the IBM ICU library.
	*
	* http://www.icu-project.org
	*
	* Full Copyright for that code follows.
	*/

	/*
	* Copyright (C) 1999-2010, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, and/or sell copies of the
	* Software, and to permit persons to whom the Software is furnished to do so,
	* provided that the above copyright notice(s) and this permission notice appear
	* in all copies of the Software and that both the above copyright notice(s) and
	* this permission notice appear in supporting documentation.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
	* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
	* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
	* ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
	* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
	*
	* Except as contained in this notice, the name of a copyright holder shall not
	* be used in advertising or otherwise to promote the sale, use or other
	* dealings in this Software without prior written authorization of the
	* copyright holder.
	*/

	/**
	* Class to encode java's UTF16 char[] into UTF8 byte[] without always allocating a new byte[] as
	* String.getBytes("UTF-8") does.
	*/
	public final class UnicodeUtil
	{

	private UnicodeUtil()
	{
	} // no instance

	public static final int UNI_SUR_HIGH_START = 0xD800;

	public static final int UNI_SUR_HIGH_END = 0xDBFF;

	public static final int UNI_SUR_LOW_START = 0xDC00;

	public static final int UNI_SUR_LOW_END = 0xDFFF;

	public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;

	private static final long UNI_MAX_BMP = 0x0000FFFF;

	private static final long HALF_SHIFT = 10;

	private static final long HALF_MASK = 0x3FFL;

	private static final int SURROGATE_OFFSET = Character.MIN_SUPPLEMENTARY_CODE_POINT
	- ( UNI_SUR_HIGH_START << HALF_SHIFT ) - UNI_SUR_LOW_START;

	// Special String package private internal constructor for sharing char-array usage
	private static final Constructor<String> STRING_PP_CONSTRUCTOR;

	static
	{
	Constructor<String> constructor = null;
	try
	{
	constructor = String.class.getDeclaredConstructor( int.class, int.class, char[].class );
	constructor.setAccessible( true );
	}
	catch ( SecurityException e )
	{
	// intentionally left blank
	}
	catch ( NoSuchMethodException e )
	{
	// intentionally left blank
	}
	STRING_PP_CONSTRUCTOR = constructor;
	}

	/**
	* Encode characters from the given {@link String}, starting at offset 0 for length chars. Returns length of the
	* encoded String in bytes.
	*
	* @throws IOException
	*/
	public static int UTF16toUTF8( String value, Target target )
	throws IOException
	{
	char[] characters = value.toCharArray();
	int length = characters.length;

	// Write string length to target
	target.writeInt( length );

	int i = 0;
	final int end = length;

	int writtenBytes = 0;
	while ( i < end )
	{

	final int code = characters[i++];

	if ( code < 0x80 )
	{
	target.writeByte( (byte) code );
	writtenBytes++;
	}
	else if ( code < 0x800 )
	{
	target.writeByte( (byte) ( 0xC0 \| ( code >> 6 ) ) );
	target.writeByte( (byte) ( 0x80 \| ( code & 0x3F ) ) );
	writtenBytes += 2;
	}
	else if ( code < 0xD800 \|\| code > 0xDFFF )
	{
	target.writeByte( (byte) ( 0xE0 \| ( code >> 12 ) ) );
	target.writeByte( (byte) ( 0x80 \| ( ( code >> 6 ) & 0x3F ) ) );
	target.writeByte( (byte) ( 0x80 \| ( code & 0x3F ) ) );
	writtenBytes += 3;
	}
	else
	{
	// surrogate pair
	// confirm valid high surrogate
	if ( code < 0xDC00 && i < end )
	{
	int utf32 = characters[i];
	// confirm valid low surrogate and write pair
	if ( utf32 >= 0xDC00 && utf32 <= 0xDFFF )
	{
	utf32 = ( code << 10 ) + utf32 + SURROGATE_OFFSET;
	i++;
	target.writeByte( (byte) ( 0xF0 \| ( utf32 >> 18 ) ) );
	target.writeByte( (byte) ( 0x80 \| ( ( utf32 >> 12 ) & 0x3F ) ) );
	target.writeByte( (byte) ( 0x80 \| ( ( utf32 >> 6 ) & 0x3F ) ) );
	target.writeByte( (byte) ( 0x80 \| ( utf32 & 0x3F ) ) );
	writtenBytes += 4;
	continue;
	}
	}
	// replace unpaired surrogate or out-of-order low surrogate
	// with substitution character
	target.writeByte( (byte) 0xEF );
	target.writeByte( (byte) 0xBF );
	target.writeByte( (byte) 0xBD );
	writtenBytes += 3;
	}
	}
	return writtenBytes;
	}

	public static boolean validUTF16String( CharSequence s )
	{
	final int size = s.length();
	for ( int i = 0; i < size; i++ )
	{
	char ch = s.charAt( i );
	if ( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END )
	{
	if ( i < size - 1 )
	{
	i++;
	char nextCH = s.charAt( i );
	if ( nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END )
	{
	// Valid surrogate pair
	}
	else
	{
	// Unmatched high surrogate
	return false;
	}
	}
	else
	{
	// Unmatched high surrogate
	return false;
	}
	}
	else if ( ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END )
	{
	// Unmatched low surrogate
	return false;
	}
	}

	return true;
	}

	public static boolean validUTF16String( char[] s, int size )
	{
	for ( int i = 0; i < size; i++ )
	{
	char ch = s[i];
	if ( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END )
	{
	if ( i < size - 1 )
	{
	i++;
	char nextCH = s[i];
	if ( nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END )
	{
	// Valid surrogate pair
	}
	else
	{
	return false;
	}
	}
	else
	{
	return false;
	}
	}
	else if ( ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END )
	{
	// Unmatched low surrogate
	return false;
	}
	}

	return true;
	}

	/**
	* Interprets the bytes from the given {@link Source} as UTF-8 and converts to UTF-16.
	* <p>
	* NOTE: Full characters are read, even if this reads past the length passed (and can result in an
	* ArrayOutOfBoundsException if invalid UTF-8 is passed). Explicit checks for valid UTF-8 are not performed.
	*
	* @throws IOException
	*/
	public static String UTF8toUTF16( Source source )
	throws IOException
	{
	int charLength = source.readInt();

	int offset = 0;
	final char[] out = new char[charLength];
	while ( offset < charLength )
	{
	int b = source.readByte() & 0xff;
	if ( b < 0xc0 )
	{
	assert b < 0x80;
	out[offset++] = (char) b;
	}
	else if ( b < 0xe0 )
	{
	out[offset++] = (char) ( ( ( b & 0x1f ) << 6 ) + ( source.readByte() & 0x3f ) );
	}
	else if ( b < 0xf0 )
	{
	out[offset++] =
	(char) ( ( ( b & 0xf ) << 12 ) + ( ( source.readByte() & 0x3f ) << 6 ) + ( source.readByte() & 0x3f ) );
	}
	else
	{
	assert b < 0xf8 : "b = 0x" + Integer.toHexString( b );
	int ch =
	( ( b & 0x7 ) << 18 ) + ( ( source.readByte() & 0x3f ) << 12 )
	+ ( ( source.readByte() & 0x3f ) << 6 ) + ( source.readByte() & 0x3f );
	if ( ch < UNI_MAX_BMP )
	{
	out[offset++] = (char) ch;
	}
	else
	{
	int chHalf = ch - 0x0010000;
	out[offset++] = (char) ( ( chHalf >> 10 ) + 0xD800 );
	out[offset++] = (char) ( ( chHalf & HALF_MASK ) + 0xDC00 );
	}
	}
	}

	if ( STRING_PP_CONSTRUCTOR != null )
	{
	return new String( out );
	}
	else
	{
	try
	{
	return STRING_PP_CONSTRUCTOR.newInstance( 0, out.length, out );
	}
	catch ( Exception e )
	{
	return new String( out );
	}
	}
	}

	}