src/main/java/org/apache/commons/imaging/util/UnicodeUtils.java - sanselan - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.commons.imaging.util;

 import java.io.UnsupportedEncodingException;

 import org.apache.commons.imaging.common.BinaryConstants;

 public abstract class UnicodeUtils implements BinaryConstants
 {
     /**
      * Private constructor: code outside this file cannot instantiate or
      * subclass this class directly. The nested metrics classes in this file
      * extend it, and {@link #getInstance(int)} hands out instances of those.
      */
     private UnicodeUtils()
     {
     }

     /**
      * Checked exception thrown by this class when character data is rejected
      * or an unknown encoding code is requested.
      */
     public static class UnicodeException extends Exception
     {
         /**
          * @param message description of the problem
          */
         public UnicodeException(final String message)
         {
             super(message);
         }
     }

     // Encoding codes understood by getInstance(int).
     // A default single-byte charset.
     public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
     // UTF-16 that starts with a byte order mark. Both WITH_BOM codes select
     // the same implementation, which reads the byte order from the mark.
     public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
     public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
     // UTF-16 without a byte order mark; the code itself fixes the byte order.
     public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
     public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
     public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
     // Has no case in getInstance(int), which rejects it as an unknown code.
     public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;

     // /*
     // * Guess the character encoding of arbitrary character data in a data
     // * buffer.
     // *
     // * The data may not run to the end of the buffer; it may be terminated.
     // This
     // * makes the problem much harder, since the character data may be followed
     // * by arbitrary data.
     // */
     // public static int guessCharacterEncoding(byte bytes[], int index)
     // {
     // int length = bytes.length - index;
     //
     // if (length < 1)
     // return CHAR_ENCODING_CODE_AMBIGUOUS;
     //
     // if (length >= 2)
     // {
     // // look for BOM.
     //
     // int c1 = 0xff & bytes[index];
     // int c2 = 0xff & bytes[index + 1];
     // if (c1 == 0xFF && c2 == 0xFE)
     // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
     // else if (c1 == 0xFE && c2 == 0xFF)
     // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
     // }
     //
     // }
     //
     // /*
     // * Guess the character encoding of arbitrary character data in a data
     // * buffer.
     // *
     // * The data fills the entire buffer. If it is terminated, the terminator
     // * byte(s) will be the last bytes in the buffer.
     // *
     // * This makes the problem a bit easier.
     // */
     // public static int guessCharacterEncodingSimple(byte bytes[], int index)
     // throws UnicodeException
     // {
     // int length = bytes.length - index;
     //
     // if (length < 1)
     // return CHAR_ENCODING_CODE_AMBIGUOUS;
     //
     // if (length >= 2)
     // {
     // // identify or eliminate UTF-16 with a BOM.
     //
     // int c1 = 0xff & bytes[index];
     // int c2 = 0xff & bytes[index + 1];
     // if (c1 == 0xFF && c2 == 0xFE)
     // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
     // else if (c1 == 0xFE && c2 == 0xFF)
     // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
     // }
     //
     // if (length >= 2)
     // {
     // // look for optional double-byte terminator.
     //
     // int c1 = 0xff & bytes[bytes.length - 2];
     // int c2 = 0xff & bytes[bytes.length - 1];
     // if (c1 == 0 && c2 == 0)
     // {
     // // definitely a flavor of UTF-16.
     // if (length % 2 != 0)
     // throw new UnicodeException(
     // "Character data with double-byte terminator has an odd length.");
     //
     // boolean mayHaveTerminator = true;
     // boolean mustHaveTerminator = false;
     // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
     // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
     // mayHaveTerminator, mustHaveTerminator);
     // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
     // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
     // mayHaveTerminator, mustHaveTerminator);
     // if ((!possibleBigEndian) && (!possibleLittleEndian))
     // throw new UnicodeException(
     // "Invalid character data, possibly UTF-16.");
     // if (possibleBigEndian && possibleLittleEndian)
     // return CHAR_ENCODING_CODE_AMBIGUOUS;
     // if (possibleBigEndian)
     // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
     // if (possibleLittleEndian)
     // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
     // }
     // }
     //
     // List possibleEncodings = new ArrayList();
     // if (length % 2 == 0)
     // {
     // boolean mayHaveTerminator = true;
     // boolean mustHaveTerminator = false;
     // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
     // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
     // mayHaveTerminator, mustHaveTerminator);
     // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
     // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
     // mayHaveTerminator, mustHaveTerminator);
     //
     // if (possibleBigEndian)
     // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
     // if (possibleLittleEndian)
     // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
     // }
     //
     // }

     /**
      * Tests whether a string survives an encode/decode round trip through
      * ISO-8859-1 unchanged.
      *
      * @param s the string to test
      * @return {@code true} if decoding the ISO-8859-1 encoding of {@code s}
      *         yields a string equal to {@code s}
      */
     public static final boolean isValidISO_8859_1(String s)
     {
         final String charsetName = "ISO-8859-1";
         final String decoded;
         try
         {
             final byte[] encoded = s.getBytes(charsetName);
             decoded = new String(encoded, charsetName);
         } catch (UnsupportedEncodingException e)
         {
             // should never be thrown.
             throw new RuntimeException("Error parsing string.", e);
         }
         return s.equals(decoded);
     }

     /**
      * Returns the index of the first UTF-16 terminator, i.e. a pair of zero
      * bytes found while stepping two bytes at a time from {@code index}.
      *
      * @param bytes buffer to scan
      * @param index offset at which scanning starts
      * @return index of the first byte of the terminator, or -1 if not found
      */
     private static int findFirstDoubleByteTerminator(byte bytes[], int index)
     {
         for (int i = index; i < bytes.length - 1; i += 2)
         {
             // Inspect the pair at the current scan position; reading at
             // 'index' would test the first pair over and over.
             if (bytes[i] == 0 && bytes[i + 1] == 0)
                 return i;
         }
         return -1;
     }

     /**
      * Finds the end of the character data starting at {@code index}, asking
      * the encoding-specific scan to count the terminator as part of the data.
      *
      * @param bytes buffer holding the character data
      * @param index offset of the first byte of the data
      * @return the result of {@code findEnd(bytes, index, true)}
      * @throws UnicodeException if the encoding-specific scan rejects the data
      */
     public final int findEndWithTerminator(byte bytes[], int index)
             throws UnicodeException
     {
         final boolean includeTerminator = true;
         return findEnd(bytes, index, includeTerminator);
     }

     /**
      * Finds the end of the character data starting at {@code index}, asking
      * the encoding-specific scan to leave the terminator out of the data.
      *
      * @param bytes buffer holding the character data
      * @param index offset of the first byte of the data
      * @return the result of {@code findEnd(bytes, index, false)}
      * @throws UnicodeException if the encoding-specific scan rejects the data
      */
     public final int findEndWithoutTerminator(byte bytes[], int index)
             throws UnicodeException
     {
         final boolean includeTerminator = false;
         return findEnd(bytes, index, includeTerminator);
     }

     /**
      * Locates the end of the character data that starts at {@code index}.
      * <p>
      * The implementations in this file stop at the first terminator and
      * return the offset just past it when {@code includeTerminator} is
      * {@code true}, or the offset of the terminator itself otherwise. When
      * the scan arrives exactly at the end of the buffer without having seen
      * a terminator, they return {@code bytes.length}.
      *
      * @param bytes buffer holding the character data
      * @param index offset at which to start scanning
      * @param includeTerminator whether the returned offset covers the
      *            terminator
      * @return offset of the end of the character data
      * @throws UnicodeException if the implementation rejects the data
      */
     protected abstract int findEnd(byte bytes[], int index,
             boolean includeTerminator) throws UnicodeException;

     /**
      * Creates the scanner that matches a {@code CHAR_ENCODING_CODE_*} value.
      *
      * @param charEncodingCode one of the {@code CHAR_ENCODING_CODE_*}
      *            constants handled below
      * @return a new scanner for that encoding
      * @throws UnicodeException if the code has no case below (this includes
      *             {@code CHAR_ENCODING_CODE_AMBIGUOUS})
      */
     public static UnicodeUtils getInstance(int charEncodingCode)
             throws UnicodeException
     {
         final UnicodeUtils metrics;
         switch (charEncodingCode)
         {
         case CHAR_ENCODING_CODE_ISO_8859_1:
             metrics = new UnicodeMetricsASCII();
             break;
         case CHAR_ENCODING_CODE_UTF_8:
             metrics = new UnicodeMetricsUTF8();
             break;
         case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
         case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
             // The byte order is taken from the mark in the data.
             metrics = new UnicodeMetricsUTF16WithBOM();
             break;
         case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
             metrics = new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);
             break;
         case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
             metrics = new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);
             break;
         default:
             throw new UnicodeException("Unknown char encoding code: "
                     + charEncodingCode);
         }
         return metrics;
     }

     /**
      * Scanner for single-byte character data terminated by a zero byte.
      */
     private static class UnicodeMetricsASCII extends UnicodeUtils
     {
         /**
          * Walks forward from {@code index} to the first zero byte.
          *
          * @return the offset just past the zero byte when
          *         {@code includeTerminator} is set, the offset of the zero
          *         byte otherwise, or {@code bytes.length} if there is none
          */
         @Override
         public int findEnd(byte bytes[], int index, boolean includeTerminator)
                 throws UnicodeException
         {
             int pos = index;
             while (pos < bytes.length && bytes[pos] != 0)
                 pos++;

             if (pos >= bytes.length)
             {
                 // No terminator: the data runs to the end of the buffer.
                 return bytes.length;
             }
             return includeTerminator ? pos + 1 : pos;
         }
     }

     // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
     // {
     // public int findEnd(byte bytes[], int index, boolean includeTerminator)
     // throws UnicodeException
     // {
     // for (int i = index; i < bytes.length; i++)
     // {
     // if (bytes[i] == 0)
     // return includeTerminator ? i + 1 : i;
     // }
     // return bytes.length;
     // // throw new UnicodeException("Terminator not found.");
     // }
     // }

     /**
      * Scanner for UTF-8 character data terminated by a zero byte.
      */
     private static class UnicodeMetricsUTF8 extends UnicodeUtils
     {

         /**
          * Steps through the data one encoded character at a time until a
          * zero byte or the end of the buffer is reached.
          * <p>
          * Every lead byte from 0x80 to 0xDF is taken to start a two-byte
          * sequence, 0xE0 to 0xEF a three-byte sequence and 0xF0 to 0xF4 a
          * four-byte sequence; each following byte must lie in 0x80..0xBF.
          *
          * @return the offset just past the zero byte when
          *         {@code includeTerminator} is set, the offset of the zero
          *         byte otherwise, or {@code bytes.length} if the data ends
          *         exactly at the end of the buffer
          * @throws UnicodeException if {@code index} is past the end of the
          *             buffer, a sequence is cut off by the end of the
          *             buffer, or a byte is outside the ranges above
          */
         @Override
         public int findEnd(byte bytes[], int index, boolean includeTerminator)
                 throws UnicodeException
         {
             // http://en.wikipedia.org/wiki/UTF-8

             int pos = index;
             while (pos != bytes.length)
             {
                 if (pos > bytes.length)
                     throw new UnicodeException("Terminator not found.");

                 final int lead = 0xff & bytes[pos++];
                 if (lead == 0)
                     return includeTerminator ? pos : pos - 1;
                 if (lead <= 0x7f)
                     continue; // single-byte character

                 // Number of bytes that must follow this lead byte.
                 final int trailing;
                 if (lead <= 0xDF)
                     trailing = 1;
                 else if (lead <= 0xEF)
                     trailing = 2;
                 else if (lead <= 0xF4)
                     trailing = 3;
                 else
                     throw new UnicodeException("Invalid code point.");

                 // The whole sequence has to fit before any byte is checked.
                 if (bytes.length - pos < trailing)
                     throw new UnicodeException("Invalid unicode.");

                 for (int k = 0; k < trailing; k++)
                 {
                     final int next = 0xff & bytes[pos++];
                     if (next < 0x80 || next > 0xBF)
                         throw new UnicodeException("Invalid code point.");
                 }
             }
             return bytes.length;
         }
     }

     /**
      * Shared scanning logic for UTF-16 character data terminated by a pair
      * of zero bytes. The byte order decides which byte of each 16-bit word
      * is the most significant one.
      */
     private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
     {
         protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;

         public UnicodeMetricsUTF16(int byteOrder)
         {
             this.byteOrder = byteOrder;
         }

         /** Picks the most significant byte of a word for the byte order. */
         private int mostSignificant(int first, int second)
         {
             return byteOrder == BYTE_ORDER_BIG_ENDIAN ? first : second;
         }

         /** High (leading) surrogates are U+D800..U+DBFF. */
         private static boolean isHighSurrogate(int msb)
         {
             return msb >= 0xD8 && msb <= 0xDB;
         }

         /** Low (trailing) surrogates are U+DC00..U+DFFF. */
         private static boolean isLowSurrogate(int msb)
         {
             return msb >= 0xDC && msb <= 0xDF;
         }

         /**
          * Checks whether the bytes from {@code index} onward form
          * well-formed UTF-16 in this instance's byte order.
          *
          * @param bytes buffer holding the character data
          * @param index offset at which to start checking
          * @param mayHaveTerminator result to report when a terminator (two
          *            zero bytes) is found
          * @param mustHaveTerminator when {@code true}, reaching the end of
          *            the buffer without a terminator makes the data invalid
          * @return {@code true} if the data is acceptable
          */
         public boolean isValid(byte bytes[], int index,
                 boolean mayHaveTerminator, boolean mustHaveTerminator)
         {
             // http://en.wikipedia.org/wiki/UTF-16/UCS-2

             while (true)
             {
                 if (index == bytes.length)
                 {
                     // end of buffer, no terminator found.
                     return !mustHaveTerminator;
                 }

                 if (index >= bytes.length - 1)
                 {
                     // end of odd-length buffer, no terminator found.
                     return false;
                 }

                 int c1 = 0xff & bytes[index++];
                 int c2 = 0xff & bytes[index++];
                 int msb1 = mostSignificant(c1, c2);

                 if (c1 == 0 && c2 == 0)
                 {
                     // terminator found.
                     return mayHaveTerminator;
                 }

                 if (isLowSurrogate(msb1))
                 {
                     // low surrogate without a preceding high surrogate.
                     return false;
                 }

                 // Words at or above U+E000 are ordinary characters, not
                 // surrogates, so only U+D800..U+DBFF opens a pair.
                 if (isHighSurrogate(msb1))
                 {
                     if (index >= bytes.length - 1)
                     {
                         // missing second surrogate.
                         return false;
                     }

                     // second word.
                     int c3 = 0xff & bytes[index++];
                     int c4 = 0xff & bytes[index++];
                     if (!isLowSurrogate(mostSignificant(c3, c4)))
                     {
                         // invalid second surrogate.
                         return false;
                     }
                 }
             }
         }

         /**
          * Steps through the data one 16-bit word (or surrogate pair) at a
          * time until a terminator or the end of the buffer is reached.
          *
          * @return the offset just past the terminator when
          *         {@code includeTerminator} is set, the offset of the
          *         terminator otherwise, or {@code bytes.length} if the data
          *         ends exactly at the end of the buffer
          * @throws UnicodeException if fewer than two bytes remain where a
          *             word is expected, or a surrogate is unpaired
          */
         @Override
         public int findEnd(byte bytes[], int index, boolean includeTerminator)
                 throws UnicodeException
         {
             // http://en.wikipedia.org/wiki/UTF-16/UCS-2

             while (true)
             {
                 if (index == bytes.length)
                     return bytes.length;
                 // '>=' so that a single trailing byte is reported here
                 // rather than overrunning the array on the second read.
                 if (index >= bytes.length - 1)
                     throw new UnicodeException("Terminator not found.");

                 int c1 = 0xff & bytes[index++];
                 int c2 = 0xff & bytes[index++];
                 int msb1 = mostSignificant(c1, c2);

                 if (c1 == 0 && c2 == 0)
                     return includeTerminator ? index : index - 2;

                 if (isLowSurrogate(msb1))
                 {
                     // low surrogate without a preceding high surrogate.
                     throw new UnicodeException("Invalid code point.");
                 }

                 // Words at or above U+E000 are ordinary characters, not
                 // surrogates, so only U+D800..U+DBFF opens a pair.
                 if (isHighSurrogate(msb1))
                 {
                     if (index >= bytes.length - 1)
                         throw new UnicodeException("Terminator not found.");

                     // second word.
                     int c3 = 0xff & bytes[index++];
                     int c4 = 0xff & bytes[index++];
                     if (!isLowSurrogate(mostSignificant(c3, c4)))
                         throw new UnicodeException("Invalid code point.");
                 }
             }
         }
     }

     /**
      * UTF-16 scanner for data without a byte order mark; the byte order is
      * supplied by the caller and passed straight to the superclass.
      */
     private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
     {

         /**
          * @param byteOrder byte order handed to the superclass constructor
          */
         public UnicodeMetricsUTF16NoBOM(final int byteOrder)
         {
             super(byteOrder);
         }

     }

     /**
      * UTF-16 scanner for data that begins with a byte order mark.
      */
     private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
     {

         /** Starts out big-endian; {@code findEnd} overwrites the order. */
         public UnicodeMetricsUTF16WithBOM()
         {
             super(BYTE_ORDER_BIG_ENDIAN);
         }

         /**
          * Reads the two bytes at {@code index} as a byte order mark, stores
          * the byte order they indicate in {@code byteOrder}, and then runs
          * the superclass scan on the bytes after the mark.
          *
          * @throws UnicodeException if fewer than two bytes are available
          *             for the mark, the mark is neither FF FE nor FE FF, or
          *             the superclass scan rejects the data
          */
         @Override
         public int findEnd(byte bytes[], int index, boolean includeTerminator)
                 throws UnicodeException
         {
             // http://en.wikipedia.org/wiki/UTF-16/UCS-2

             if (index >= bytes.length - 1)
                 throw new UnicodeException("Missing BOM.");

             // Fold the two mark bytes into one value, in buffer order.
             final int mark = ((0xff & bytes[index]) << 8)
                     | (0xff & bytes[index + 1]);
             switch (mark)
             {
             case 0xFFFE:
                 byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
                 break;
             case 0xFEFF:
                 byteOrder = BYTE_ORDER_BIG_ENDIAN;
                 break;
             default:
                 throw new UnicodeException("Invalid byte order mark.");
             }

             return super.findEnd(bytes, index + 2, includeTerminator);
         }
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.commons.imaging.util;

	import java.io.UnsupportedEncodingException;

	import org.apache.commons.imaging.common.BinaryConstants;

	public abstract class UnicodeUtils implements BinaryConstants
	{
	/**
	* Private constructor: code outside this file cannot instantiate or
	* subclass this class directly. The nested metrics classes in this file
	* extend it, and {@link #getInstance(int)} hands out instances of those.
	*/
	private UnicodeUtils()
	{
	}

	/**
	 * Checked exception thrown by this class when character data is rejected
	 * or an unknown encoding code is requested.
	 */
	public static class UnicodeException extends Exception
	{
	    /**
	     * @param message description of the problem
	     */
	    public UnicodeException(final String message)
	    {
	        super(message);
	    }
	}

	// Encoding codes understood by getInstance(int).
	// A default single-byte charset.
	public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
	// UTF-16 that starts with a byte order mark. Both WITH_BOM codes select
	// the same implementation, which reads the byte order from the mark.
	public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
	public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
	// UTF-16 without a byte order mark; the code itself fixes the byte order.
	public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
	public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
	public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
	// Has no case in getInstance(int), which rejects it as an unknown code.
	public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;

	// /*
	// * Guess the character encoding of arbitrary character data in a data
	// * buffer.
	// *
	// * The data may not run to the end of the buffer; it may be terminated.
	// This
	// * makes the problem much harder, since the character data may be followed
	// * by arbitrary data.
	// */
	// public static int guessCharacterEncoding(byte bytes[], int index)
	// {
	// int length = bytes.length - index;
	//
	// if (length < 1)
	// return CHAR_ENCODING_CODE_AMBIGUOUS;
	//
	// if (length >= 2)
	// {
	// // look for BOM.
	//
	// int c1 = 0xff & bytes[index];
	// int c2 = 0xff & bytes[index + 1];
	// if (c1 == 0xFF && c2 == 0xFE)
	// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
	// else if (c1 == 0xFE && c2 == 0xFF)
	// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
	// }
	//
	// }
	//
	// /*
	// * Guess the character encoding of arbitrary character data in a data
	// * buffer.
	// *
	// * The data fills the entire buffer. If it is terminated, the terminator
	// * byte(s) will be the last bytes in the buffer.
	// *
	// * This makes the problem a bit easier.
	// */
	// public static int guessCharacterEncodingSimple(byte bytes[], int index)
	// throws UnicodeException
	// {
	// int length = bytes.length - index;
	//
	// if (length < 1)
	// return CHAR_ENCODING_CODE_AMBIGUOUS;
	//
	// if (length >= 2)
	// {
	// // identify or eliminate UTF-16 with a BOM.
	//
	// int c1 = 0xff & bytes[index];
	// int c2 = 0xff & bytes[index + 1];
	// if (c1 == 0xFF && c2 == 0xFE)
	// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
	// else if (c1 == 0xFE && c2 == 0xFF)
	// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
	// }
	//
	// if (length >= 2)
	// {
	// // look for optional double-byte terminator.
	//
	// int c1 = 0xff & bytes[bytes.length - 2];
	// int c2 = 0xff & bytes[bytes.length - 1];
	// if (c1 == 0 && c2 == 0)
	// {
	// // definitely a flavor of UTF-16.
	// if (length % 2 != 0)
	// throw new UnicodeException(
	// "Character data with double-byte terminator has an odd length.");
	//
	// boolean mayHaveTerminator = true;
	// boolean mustHaveTerminator = false;
	// boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
	// BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
	// mayHaveTerminator, mustHaveTerminator);
	// boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
	// BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
	// mayHaveTerminator, mustHaveTerminator);
	// if ((!possibleBigEndian) && (!possibleLittleEndian))
	// throw new UnicodeException(
	// "Invalid character data, possibly UTF-16.");
	// if (possibleBigEndian && possibleLittleEndian)
	// return CHAR_ENCODING_CODE_AMBIGUOUS;
	// if (possibleBigEndian)
	// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
	// if (possibleLittleEndian)
	// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
	// }
	// }
	//
	// List possibleEncodings = new ArrayList();
	// if (length % 2 == 0)
	// {
	// boolean mayHaveTerminator = true;
	// boolean mustHaveTerminator = false;
	// boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
	// BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
	// mayHaveTerminator, mustHaveTerminator);
	// boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
	// BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
	// mayHaveTerminator, mustHaveTerminator);
	//
	// if (possibleBigEndian)
	// return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
	// if (possibleLittleEndian)
	// return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
	// }
	//
	// }

	/**
	 * Tests whether a string survives an encode/decode round trip through
	 * ISO-8859-1 unchanged.
	 *
	 * @param s the string to test
	 * @return {@code true} if decoding the ISO-8859-1 encoding of {@code s}
	 *         yields a string equal to {@code s}
	 */
	public static final boolean isValidISO_8859_1(String s)
	{
	    final String charsetName = "ISO-8859-1";
	    final String decoded;
	    try
	    {
	        final byte[] encoded = s.getBytes(charsetName);
	        decoded = new String(encoded, charsetName);
	    } catch (UnsupportedEncodingException e)
	    {
	        // should never be thrown.
	        throw new RuntimeException("Error parsing string.", e);
	    }
	    return s.equals(decoded);
	}

	/**
	 * Returns the index of the first UTF-16 terminator, i.e. a pair of zero
	 * bytes found while stepping two bytes at a time from {@code index}.
	 *
	 * @param bytes buffer to scan
	 * @param index offset at which scanning starts
	 * @return index of the first byte of the terminator, or -1 if not found
	 */
	private static int findFirstDoubleByteTerminator(byte bytes[], int index)
	{
	    for (int i = index; i < bytes.length - 1; i += 2)
	    {
	        // Inspect the pair at the current scan position; reading at
	        // 'index' would test the first pair over and over.
	        if (bytes[i] == 0 && bytes[i + 1] == 0)
	            return i;
	    }
	    return -1;
	}

	/**
	 * Finds the end of the character data starting at {@code index}, asking
	 * the encoding-specific scan to count the terminator as part of the data.
	 *
	 * @param bytes buffer holding the character data
	 * @param index offset of the first byte of the data
	 * @return the result of {@code findEnd(bytes, index, true)}
	 * @throws UnicodeException if the encoding-specific scan rejects the data
	 */
	public final int findEndWithTerminator(byte bytes[], int index)
	        throws UnicodeException
	{
	    final boolean includeTerminator = true;
	    return findEnd(bytes, index, includeTerminator);
	}

	/**
	 * Finds the end of the character data starting at {@code index}, asking
	 * the encoding-specific scan to leave the terminator out of the data.
	 *
	 * @param bytes buffer holding the character data
	 * @param index offset of the first byte of the data
	 * @return the result of {@code findEnd(bytes, index, false)}
	 * @throws UnicodeException if the encoding-specific scan rejects the data
	 */
	public final int findEndWithoutTerminator(byte bytes[], int index)
	        throws UnicodeException
	{
	    final boolean includeTerminator = false;
	    return findEnd(bytes, index, includeTerminator);
	}

	/**
	* Locates the end of the character data that starts at {@code index}.
	* <p>
	* The implementations in this file stop at the first terminator and
	* return the offset just past it when {@code includeTerminator} is
	* {@code true}, or the offset of the terminator itself otherwise. When
	* the scan arrives exactly at the end of the buffer without having seen
	* a terminator, they return {@code bytes.length}.
	*
	* @param bytes buffer holding the character data
	* @param index offset at which to start scanning
	* @param includeTerminator whether the returned offset covers the
	* terminator
	* @return offset of the end of the character data
	* @throws UnicodeException if the implementation rejects the data
	*/
	protected abstract int findEnd(byte bytes[], int index,
	boolean includeTerminator) throws UnicodeException;

	/**
	 * Creates the scanner that matches a {@code CHAR_ENCODING_CODE_*} value.
	 *
	 * @param charEncodingCode one of the {@code CHAR_ENCODING_CODE_*}
	 *            constants handled below
	 * @return a new scanner for that encoding
	 * @throws UnicodeException if the code has no case below (this includes
	 *             {@code CHAR_ENCODING_CODE_AMBIGUOUS})
	 */
	public static UnicodeUtils getInstance(int charEncodingCode)
	        throws UnicodeException
	{
	    final UnicodeUtils metrics;
	    switch (charEncodingCode)
	    {
	    case CHAR_ENCODING_CODE_ISO_8859_1:
	        metrics = new UnicodeMetricsASCII();
	        break;
	    case CHAR_ENCODING_CODE_UTF_8:
	        metrics = new UnicodeMetricsUTF8();
	        break;
	    case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
	    case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
	        // The byte order is taken from the mark in the data.
	        metrics = new UnicodeMetricsUTF16WithBOM();
	        break;
	    case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
	        metrics = new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);
	        break;
	    case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
	        metrics = new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);
	        break;
	    default:
	        throw new UnicodeException("Unknown char encoding code: "
	                + charEncodingCode);
	    }
	    return metrics;
	}

	/**
	 * Scanner for single-byte character data terminated by a zero byte.
	 */
	private static class UnicodeMetricsASCII extends UnicodeUtils
	{
	    /**
	     * Walks forward from {@code index} to the first zero byte.
	     *
	     * @return the offset just past the zero byte when
	     *         {@code includeTerminator} is set, the offset of the zero
	     *         byte otherwise, or {@code bytes.length} if there is none
	     */
	    @Override
	    public int findEnd(byte bytes[], int index, boolean includeTerminator)
	            throws UnicodeException
	    {
	        int pos = index;
	        while (pos < bytes.length && bytes[pos] != 0)
	            pos++;

	        if (pos >= bytes.length)
	        {
	            // No terminator: the data runs to the end of the buffer.
	            return bytes.length;
	        }
	        return includeTerminator ? pos + 1 : pos;
	    }
	}

	// private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
	// {
	// public int findEnd(byte bytes[], int index, boolean includeTerminator)
	// throws UnicodeException
	// {
	// for (int i = index; i < bytes.length; i++)
	// {
	// if (bytes[i] == 0)
	// return includeTerminator ? i + 1 : i;
	// }
	// return bytes.length;
	// // throw new UnicodeException("Terminator not found.");
	// }
	// }

	private static class UnicodeMetricsUTF8 extends UnicodeUtils
	{

	@Override
	public int findEnd(byte bytes[], int index, boolean includeTerminator)
	throws UnicodeException
	{
	// http://en.wikipedia.org/wiki/UTF-8

	while (true)
	{
	if (index == bytes.length)
	return bytes.length;
	if (index > bytes.length)
	throw new UnicodeException("Terminator not found.");

	int c1 = 0xff & bytes[index++];
	if (c1 == 0)
	return includeTerminator ? index : index - 1;
	else if (c1 <= 0x7f)
	continue;
	else if (c1 <= 0xDF)
	{
	if (index >= bytes.length)
	throw new UnicodeException("Invalid unicode.");

	int c2 = 0xff & bytes[index++];
	if (c2 < 0x80 \|\| c2 > 0xBF)
	throw new UnicodeException("Invalid code point.");
	} else if (c1 <= 0xEF)
	{
	if (index >= bytes.length - 1)
	throw new UnicodeException("Invalid unicode.");

	int c2 = 0xff & bytes[index++];
	if (c2 < 0x80 \|\| c2 > 0xBF)
	throw new UnicodeException("Invalid code point.");
	int c3 = 0xff & bytes[index++];
	if (c3 < 0x80 \|\| c3 > 0xBF)
	throw new UnicodeException("Invalid code point.");
	} else if (c1 <= 0xF4)
	{
	if (index >= bytes.length - 2)
	throw new UnicodeException("Invalid unicode.");

	int c2 = 0xff & bytes[index++];
	if (c2 < 0x80 \|\| c2 > 0xBF)
	throw new UnicodeException("Invalid code point.");
	int c3 = 0xff & bytes[index++];
	if (c3 < 0x80 \|\| c3 > 0xBF)
	throw new UnicodeException("Invalid code point.");
	int c4 = 0xff & bytes[index++];
	if (c4 < 0x80 \|\| c4 > 0xBF)
	throw new UnicodeException("Invalid code point.");
	} else
	throw new UnicodeException("Invalid code point.");
	}
	}
	}

	/**
	 * Shared scanning logic for UTF-16 character data terminated by a pair
	 * of zero bytes. The byte order decides which byte of each 16-bit word
	 * is the most significant one.
	 */
	private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
	{
	    protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;

	    public UnicodeMetricsUTF16(int byteOrder)
	    {
	        this.byteOrder = byteOrder;
	    }

	    /** Picks the most significant byte of a word for the byte order. */
	    private int mostSignificant(int first, int second)
	    {
	        return byteOrder == BYTE_ORDER_BIG_ENDIAN ? first : second;
	    }

	    /** High (leading) surrogates are U+D800..U+DBFF. */
	    private static boolean isHighSurrogate(int msb)
	    {
	        return msb >= 0xD8 && msb <= 0xDB;
	    }

	    /** Low (trailing) surrogates are U+DC00..U+DFFF. */
	    private static boolean isLowSurrogate(int msb)
	    {
	        return msb >= 0xDC && msb <= 0xDF;
	    }

	    /**
	     * Checks whether the bytes from {@code index} onward form
	     * well-formed UTF-16 in this instance's byte order.
	     *
	     * @param bytes buffer holding the character data
	     * @param index offset at which to start checking
	     * @param mayHaveTerminator result to report when a terminator (two
	     *            zero bytes) is found
	     * @param mustHaveTerminator when {@code true}, reaching the end of
	     *            the buffer without a terminator makes the data invalid
	     * @return {@code true} if the data is acceptable
	     */
	    public boolean isValid(byte bytes[], int index,
	            boolean mayHaveTerminator, boolean mustHaveTerminator)
	    {
	        // http://en.wikipedia.org/wiki/UTF-16/UCS-2

	        while (true)
	        {
	            if (index == bytes.length)
	            {
	                // end of buffer, no terminator found.
	                return !mustHaveTerminator;
	            }

	            if (index >= bytes.length - 1)
	            {
	                // end of odd-length buffer, no terminator found.
	                return false;
	            }

	            int c1 = 0xff & bytes[index++];
	            int c2 = 0xff & bytes[index++];
	            int msb1 = mostSignificant(c1, c2);

	            if (c1 == 0 && c2 == 0)
	            {
	                // terminator found.
	                return mayHaveTerminator;
	            }

	            if (isLowSurrogate(msb1))
	            {
	                // low surrogate without a preceding high surrogate.
	                return false;
	            }

	            // Words at or above U+E000 are ordinary characters, not
	            // surrogates, so only U+D800..U+DBFF opens a pair.
	            if (isHighSurrogate(msb1))
	            {
	                if (index >= bytes.length - 1)
	                {
	                    // missing second surrogate.
	                    return false;
	                }

	                // second word.
	                int c3 = 0xff & bytes[index++];
	                int c4 = 0xff & bytes[index++];
	                if (!isLowSurrogate(mostSignificant(c3, c4)))
	                {
	                    // invalid second surrogate.
	                    return false;
	                }
	            }
	        }
	    }

	    /**
	     * Steps through the data one 16-bit word (or surrogate pair) at a
	     * time until a terminator or the end of the buffer is reached.
	     *
	     * @return the offset just past the terminator when
	     *         {@code includeTerminator} is set, the offset of the
	     *         terminator otherwise, or {@code bytes.length} if the data
	     *         ends exactly at the end of the buffer
	     * @throws UnicodeException if fewer than two bytes remain where a
	     *             word is expected, or a surrogate is unpaired
	     */
	    @Override
	    public int findEnd(byte bytes[], int index, boolean includeTerminator)
	            throws UnicodeException
	    {
	        // http://en.wikipedia.org/wiki/UTF-16/UCS-2

	        while (true)
	        {
	            if (index == bytes.length)
	                return bytes.length;
	            // '>=' so that a single trailing byte is reported here
	            // rather than overrunning the array on the second read.
	            if (index >= bytes.length - 1)
	                throw new UnicodeException("Terminator not found.");

	            int c1 = 0xff & bytes[index++];
	            int c2 = 0xff & bytes[index++];
	            int msb1 = mostSignificant(c1, c2);

	            if (c1 == 0 && c2 == 0)
	                return includeTerminator ? index : index - 2;

	            if (isLowSurrogate(msb1))
	            {
	                // low surrogate without a preceding high surrogate.
	                throw new UnicodeException("Invalid code point.");
	            }

	            // Words at or above U+E000 are ordinary characters, not
	            // surrogates, so only U+D800..U+DBFF opens a pair.
	            if (isHighSurrogate(msb1))
	            {
	                if (index >= bytes.length - 1)
	                    throw new UnicodeException("Terminator not found.");

	                // second word.
	                int c3 = 0xff & bytes[index++];
	                int c4 = 0xff & bytes[index++];
	                if (!isLowSurrogate(mostSignificant(c3, c4)))
	                    throw new UnicodeException("Invalid code point.");
	            }
	        }
	    }
	}

	/**
	* UTF-16 scanner for data without a byte order mark; the byte order is
	* supplied by the caller and passed straight to the superclass.
	*/
	private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
	{

	/**
	* @param byteOrder byte order handed to the superclass constructor
	*/
	public UnicodeMetricsUTF16NoBOM(final int byteOrder)
	{
	super(byteOrder);
	}

	}

	/**
	 * UTF-16 scanner for data that begins with a byte order mark.
	 */
	private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
	{

	    /** Starts out big-endian; {@code findEnd} overwrites the order. */
	    public UnicodeMetricsUTF16WithBOM()
	    {
	        super(BYTE_ORDER_BIG_ENDIAN);
	    }

	    /**
	     * Reads the two bytes at {@code index} as a byte order mark, stores
	     * the byte order they indicate in {@code byteOrder}, and then runs
	     * the superclass scan on the bytes after the mark.
	     *
	     * @throws UnicodeException if fewer than two bytes are available
	     *             for the mark, the mark is neither FF FE nor FE FF, or
	     *             the superclass scan rejects the data
	     */
	    @Override
	    public int findEnd(byte bytes[], int index, boolean includeTerminator)
	            throws UnicodeException
	    {
	        // http://en.wikipedia.org/wiki/UTF-16/UCS-2

	        if (index >= bytes.length - 1)
	            throw new UnicodeException("Missing BOM.");

	        // Fold the two mark bytes into one value, in buffer order.
	        final int mark = ((0xff & bytes[index]) << 8)
	                | (0xff & bytes[index + 1]);
	        switch (mark)
	        {
	        case 0xFFFE:
	            byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
	            break;
	        case 0xFEFF:
	            byteOrder = BYTE_ORDER_BIG_ENDIAN;
	            break;
	        default:
	            throw new UnicodeException("Invalid byte order mark.");
	        }

	        return super.findEnd(bytes, index + 2, includeTerminator);
	    }
	}

	}