blob: 4788305fd07d6385da5c7e3790c9aeac3ea36b06 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.directmemory.lightning.internal.util;
import java.io.IOException;
import java.lang.reflect.Constructor;
import org.apache.directmemory.lightning.Source;
import org.apache.directmemory.lightning.Target;
/*
* This code is derived from the org.apache.lucene.util.UnicodeUtil class of the Apache Lucene project.
*/
/*
* Some of this code came from the excellent Unicode
* conversion examples from:
*
* http://www.unicode.org/Public/PROGRAMS/CVTUTF
*
* Full Copyright for that code follows:
*/
/*
* Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
* This source code is provided as is by Unicode, Inc. No claims are
* made as to fitness for any particular purpose. No warranties of any
* kind are expressed or implied. The recipient agrees to determine
* applicability of information provided. If this file has been
* purchased on magnetic or optical media from Unicode, Inc., the
* sole remedy for any claim will be exchange of defective media
* within 90 days of receipt.
*
* Limitations on Rights to Redistribute This Code
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*/
/*
* Additional code came from the IBM ICU library.
*
* http://www.icu-project.org
*
* Full Copyright for that code follows.
*/
/*
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* provided that the above copyright notice(s) and this permission notice appear
* in all copies of the Software and that both the above copyright notice(s) and
* this permission notice appear in supporting documentation.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
* ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Except as contained in this notice, the name of a copyright holder shall not
* be used in advertising or otherwise to promote the sale, use or other
* dealings in this Software without prior written authorization of the
* copyright holder.
*/
/**
 * Static helpers that encode Java UTF-16 strings as UTF-8 bytes written to a {@link Target}, decode them
 * again from a {@link Source}, and check UTF-16 sequences for correctly paired surrogates.
 * <p>
 * The wire format produced by {@link #UTF16toUTF8(String, Target)} and consumed by
 * {@link #UTF8toUTF16(Source)} is an int holding the number of UTF-16 chars, followed by the UTF-8 bytes.
 */
public final class UnicodeUtil
{

    public static final int UNI_SUR_HIGH_START = 0xD800;

    public static final int UNI_SUR_HIGH_END = 0xDBFF;

    public static final int UNI_SUR_LOW_START = 0xDC00;

    public static final int UNI_SUR_LOW_END = 0xDFFF;

    public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;

    // Largest code point that fits in a single UTF-16 char.
    private static final int UNI_MAX_BMP = 0x0000FFFF;

    // Number of payload bits carried by each surrogate half.
    private static final int HALF_SHIFT = 10;

    private static final int HALF_MASK = 0x3FF;

    // Added to (high << HALF_SHIFT) + low to obtain the code point of a surrogate pair.
    private static final int SURROGATE_OFFSET = Character.MIN_SUPPLEMENTARY_CODE_POINT
        - ( UNI_SUR_HIGH_START << HALF_SHIFT ) - UNI_SUR_LOW_START;

    // Special String package private internal constructor for sharing char-array usage.
    // Stays null when the running JDK does not offer it or access is denied.
    private static final Constructor<String> STRING_PP_CONSTRUCTOR;

    static
    {
        Constructor<String> constructor = null;
        try
        {
            constructor = String.class.getDeclaredConstructor( int.class, int.class, char[].class );
            constructor.setAccessible( true );
        }
        catch ( SecurityException e )
        {
            // Best effort only: without access the public copying constructor is used instead.
            constructor = null;
        }
        catch ( NoSuchMethodException e )
        {
            // Best effort only: this JDK has no such constructor, use the public copying one instead.
            constructor = null;
        }
        STRING_PP_CONSTRUCTOR = constructor;
    }

    private UnicodeUtil()
    {
        // no instance
    }

    /**
     * Writes the given {@link String} to the target as a length prefix followed by its UTF-8 encoding.
     * <p>
     * The prefix is the number of UTF-16 chars in {@code value} (not the number of bytes). Unpaired or
     * out-of-order surrogates are replaced by the UTF-8 form of U+FFFD.
     *
     * @param value the string to encode; must not be {@code null}
     * @param target the sink receiving the length prefix and the encoded bytes
     * @return the number of UTF-8 bytes written, not counting the length prefix
     * @throws IOException if writing to the target fails
     */
    public static int UTF16toUTF8( String value, Target target )
        throws IOException
    {
        final char[] characters = value.toCharArray();
        final int end = characters.length;

        // Write string length (in chars) to target
        target.writeInt( end );

        int i = 0;
        int writtenBytes = 0;
        while ( i < end )
        {
            final int code = characters[i++];
            if ( code < 0x80 )
            {
                // 1 byte: 0xxxxxxx
                target.writeByte( (byte) code );
                writtenBytes++;
            }
            else if ( code < 0x800 )
            {
                // 2 bytes: 110xxxxx 10xxxxxx
                target.writeByte( (byte) ( 0xC0 | ( code >> 6 ) ) );
                target.writeByte( (byte) ( 0x80 | ( code & 0x3F ) ) );
                writtenBytes += 2;
            }
            else if ( code < UNI_SUR_HIGH_START || code > UNI_SUR_LOW_END )
            {
                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                target.writeByte( (byte) ( 0xE0 | ( code >> 12 ) ) );
                target.writeByte( (byte) ( 0x80 | ( ( code >> 6 ) & 0x3F ) ) );
                target.writeByte( (byte) ( 0x80 | ( code & 0x3F ) ) );
                writtenBytes += 3;
            }
            else
            {
                // surrogate range: only a high surrogate followed by a low surrogate is valid
                if ( code <= UNI_SUR_HIGH_END && i < end )
                {
                    int utf32 = characters[i];
                    if ( utf32 >= UNI_SUR_LOW_START && utf32 <= UNI_SUR_LOW_END )
                    {
                        // combine the pair into one code point and consume the low surrogate
                        utf32 = ( code << HALF_SHIFT ) + utf32 + SURROGATE_OFFSET;
                        i++;
                        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                        target.writeByte( (byte) ( 0xF0 | ( utf32 >> 18 ) ) );
                        target.writeByte( (byte) ( 0x80 | ( ( utf32 >> 12 ) & 0x3F ) ) );
                        target.writeByte( (byte) ( 0x80 | ( ( utf32 >> 6 ) & 0x3F ) ) );
                        target.writeByte( (byte) ( 0x80 | ( utf32 & 0x3F ) ) );
                        writtenBytes += 4;
                        continue;
                    }
                }
                // replace unpaired surrogate or out-of-order low surrogate
                // with substitution character (U+FFFD encoded as UTF-8)
                target.writeByte( (byte) 0xEF );
                target.writeByte( (byte) 0xBF );
                target.writeByte( (byte) 0xBD );
                writtenBytes += 3;
            }
        }
        return writtenBytes;
    }

    /**
     * Checks that every surrogate in the given sequence is part of a well-formed pair.
     *
     * @param s the sequence to check; must not be {@code null}
     * @return {@code true} if each high surrogate is directly followed by a low surrogate and no low
     *         surrogate appears on its own, {@code false} otherwise
     */
    public static boolean validUTF16String( CharSequence s )
    {
        final int size = s.length();
        for ( int i = 0; i < size; i++ )
        {
            final char ch = s.charAt( i );
            if ( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END )
            {
                if ( i >= size - 1 )
                {
                    // Unmatched high surrogate at the very end
                    return false;
                }
                i++;
                final char next = s.charAt( i );
                if ( next < UNI_SUR_LOW_START || next > UNI_SUR_LOW_END )
                {
                    // Unmatched high surrogate
                    return false;
                }
                // Valid surrogate pair
            }
            else if ( ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END )
            {
                // Unmatched low surrogate
                return false;
            }
        }
        return true;
    }

    /**
     * Checks that every surrogate in the first {@code size} chars of the array is part of a well-formed
     * pair.
     *
     * @param s the chars to check
     * @param size the number of leading chars of {@code s} to examine
     * @return {@code true} if each high surrogate is directly followed by a low surrogate and no low
     *         surrogate appears on its own, {@code false} otherwise
     */
    public static boolean validUTF16String( char[] s, int size )
    {
        for ( int i = 0; i < size; i++ )
        {
            final char ch = s[i];
            if ( ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END )
            {
                if ( i >= size - 1 )
                {
                    // Unmatched high surrogate at the very end
                    return false;
                }
                i++;
                final char next = s[i];
                if ( next < UNI_SUR_LOW_START || next > UNI_SUR_LOW_END )
                {
                    // Unmatched high surrogate
                    return false;
                }
                // Valid surrogate pair
            }
            else if ( ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END )
            {
                // Unmatched low surrogate
                return false;
            }
        }
        return true;
    }

    /**
     * Reads a length prefix (number of UTF-16 chars) from the given {@link Source}, then interprets the
     * following bytes as UTF-8 and converts them to UTF-16 until that many chars have been produced.
     * <p>
     * NOTE: Full validation of the UTF-8 input is not performed. Only a negative length prefix and a
     * supplementary code point that would overflow the announced length are rejected.
     *
     * @param source the source to read the length prefix and the UTF-8 bytes from
     * @return the decoded string
     * @throws IOException if reading fails, the length prefix is negative, or the data yields more chars
     *             than the length prefix announced
     */
    public static String UTF8toUTF16( Source source )
        throws IOException
    {
        final int charLength = source.readInt();
        if ( charLength < 0 )
        {
            throw new IOException( "Invalid string length: " + charLength );
        }

        final char[] out = new char[charLength];
        int offset = 0;
        while ( offset < charLength )
        {
            final int b = source.readByte() & 0xff;
            if ( b < 0xc0 )
            {
                // single byte; 0x80..0xBF would be a stray continuation byte
                assert b < 0x80;
                out[offset++] = (char) b;
            }
            else if ( b < 0xe0 )
            {
                final int b2 = source.readByte() & 0x3f;
                out[offset++] = (char) ( ( ( b & 0x1f ) << 6 ) + b2 );
            }
            else if ( b < 0xf0 )
            {
                final int b2 = source.readByte() & 0x3f;
                final int b3 = source.readByte() & 0x3f;
                out[offset++] = (char) ( ( ( b & 0xf ) << 12 ) + ( b2 << 6 ) + b3 );
            }
            else
            {
                assert b < 0xf8 : "b = 0x" + Integer.toHexString( b );
                final int b2 = source.readByte() & 0x3f;
                final int b3 = source.readByte() & 0x3f;
                final int b4 = source.readByte() & 0x3f;
                final int ch = ( ( b & 0x7 ) << 18 ) + ( b2 << 12 ) + ( b3 << 6 ) + b4;
                if ( ch <= UNI_MAX_BMP )
                {
                    // still fits into a single char (U+FFFF included)
                    out[offset++] = (char) ch;
                }
                else
                {
                    // a supplementary code point needs two chars in the output
                    if ( offset + 1 >= charLength )
                    {
                        throw new IOException( "Malformed string data: surrogate pair exceeds announced length "
                            + charLength );
                    }
                    final int chHalf = ch - Character.MIN_SUPPLEMENTARY_CODE_POINT;
                    out[offset++] = (char) ( ( chHalf >> HALF_SHIFT ) + UNI_SUR_HIGH_START );
                    out[offset++] = (char) ( ( chHalf & HALF_MASK ) + UNI_SUR_LOW_START );
                }
            }
        }
        return newString( out );
    }

    /**
     * Builds a String from the given chars, using the reflectively obtained String constructor when it is
     * available and the public {@code String(char[])} constructor otherwise.
     */
    private static String newString( char[] chars )
    {
        if ( STRING_PP_CONSTRUCTOR != null )
        {
            try
            {
                return STRING_PP_CONSTRUCTOR.newInstance( 0, chars.length, chars );
            }
            catch ( Exception e )
            {
                // Reflection failed at call time; fall back to the public constructor below.
            }
        }
        return new String( chars );
    }
}