| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.commons.imaging.util; |
| |
| import java.io.UnsupportedEncodingException; |
| |
| import org.apache.commons.imaging.common.BinaryConstants; |
| |
| public abstract class UnicodeUtils implements BinaryConstants |
| { |
/**
 * Private so that instances can only be created from inside this file,
 * i.e. by the nested encoding-specific subclasses.
 */
private UnicodeUtils()
{
    // nothing to initialise
}
| |
| public static class UnicodeException extends Exception |
| { |
| public UnicodeException(String message) |
| { |
| super(message); |
| } |
| } |
| |
| // A default single-byte charset. |
| public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0; |
| public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1; |
| public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2; |
| public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3; |
| public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4; |
| public static final int CHAR_ENCODING_CODE_UTF_8 = 5; |
| public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1; |
| |
| // /* |
| // * Guess the character encoding of arbitrary character data in a data |
| // * buffer. |
| // * |
| // * The data may not run to the end of the buffer; it may be terminated. |
| // This |
| // * makes the problem much harder, since the character data may be followed |
| // * by arbitrary data. |
| // */ |
| // public static int guessCharacterEncoding(byte bytes[], int index) |
| // { |
| // int length = bytes.length - index; |
| // |
| // if (length < 1) |
| // return CHAR_ENCODING_CODE_AMBIGUOUS; |
| // |
| // if (length >= 2) |
| // { |
| // // look for BOM. |
| // |
| // int c1 = 0xff & bytes[index]; |
| // int c2 = 0xff & bytes[index + 1]; |
| // if (c1 == 0xFF && c2 == 0xFE) |
| // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM; |
| // else if (c1 == 0xFE && c2 == 0xFF) |
| // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM; |
| // } |
| // |
| // } |
| // |
| // /* |
| // * Guess the character encoding of arbitrary character data in a data |
| // * buffer. |
| // * |
| // * The data fills the entire buffer. If it is terminated, the terminator |
| // * byte(s) will be the last bytes in the buffer. |
| // * |
| // * This makes the problem a bit easier. |
| // */ |
| // public static int guessCharacterEncodingSimple(byte bytes[], int index) |
| // throws UnicodeException |
| // { |
| // int length = bytes.length - index; |
| // |
| // if (length < 1) |
| // return CHAR_ENCODING_CODE_AMBIGUOUS; |
| // |
| // if (length >= 2) |
| // { |
| // // identify or eliminate UTF-16 with a BOM. |
| // |
| // int c1 = 0xff & bytes[index]; |
| // int c2 = 0xff & bytes[index + 1]; |
| // if (c1 == 0xFF && c2 == 0xFE) |
| // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM; |
| // else if (c1 == 0xFE && c2 == 0xFF) |
| // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM; |
| // } |
| // |
| // if (length >= 2) |
| // { |
| // // look for optional double-byte terminator. |
| // |
| // int c1 = 0xff & bytes[bytes.length - 2]; |
| // int c2 = 0xff & bytes[bytes.length - 1]; |
| // if (c1 == 0 && c2 == 0) |
| // { |
| // // definitely a flavor of UTF-16. |
| // if (length % 2 != 0) |
| // throw new UnicodeException( |
| // "Character data with double-byte terminator has an odd length."); |
| // |
| // boolean mayHaveTerminator = true; |
| // boolean mustHaveTerminator = false; |
| // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM( |
| // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index, |
| // mayHaveTerminator, mustHaveTerminator); |
| // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM( |
| // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index, |
| // mayHaveTerminator, mustHaveTerminator); |
| // if ((!possibleBigEndian) && (!possibleLittleEndian)) |
| // throw new UnicodeException( |
| // "Invalid character data, possibly UTF-16."); |
| // if (possibleBigEndian && possibleLittleEndian) |
| // return CHAR_ENCODING_CODE_AMBIGUOUS; |
| // if (possibleBigEndian) |
| // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM; |
| // if (possibleLittleEndian) |
| // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM; |
| // } |
| // } |
| // |
| // List possibleEncodings = new ArrayList(); |
| // if (length % 2 == 0) |
| // { |
| // boolean mayHaveTerminator = true; |
| // boolean mustHaveTerminator = false; |
| // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM( |
| // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index, |
| // mayHaveTerminator, mustHaveTerminator); |
| // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM( |
| // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index, |
| // mayHaveTerminator, mustHaveTerminator); |
| // |
| // if (possibleBigEndian) |
| // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM; |
| // if (possibleLittleEndian) |
| // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM; |
| // } |
| // |
| // } |
| |
| public static final boolean isValidISO_8859_1(String s) |
| { |
| try |
| { |
| String roundtrip = new String(s.getBytes("ISO-8859-1"), |
| "ISO-8859-1"); |
| return s.equals(roundtrip); |
| } catch (UnsupportedEncodingException e) |
| { |
| // should never be thrown. |
| throw new RuntimeException("Error parsing string.", e); |
| } |
| } |
| |
| /* |
| * Return the index of the first utf-16 terminator (ie. two even-aligned |
| * nulls). If not found, return -1. |
| */ |
| private static int findFirstDoubleByteTerminator(byte bytes[], int index) |
| { |
| for (int i = index; i < bytes.length - 1; i += 2) |
| { |
| int c1 = 0xff & bytes[index]; |
| int c2 = 0xff & bytes[index + 1]; |
| if (c1 == 0 && c2 == 0) |
| return i; |
| } |
| return -1; |
| } |
| |
| public final int findEndWithTerminator(byte bytes[], int index) |
| throws UnicodeException |
| { |
| return findEnd(bytes, index, true); |
| } |
| |
| public final int findEndWithoutTerminator(byte bytes[], int index) |
| throws UnicodeException |
| { |
| return findEnd(bytes, index, false); |
| } |
| |
| protected abstract int findEnd(byte bytes[], int index, |
| boolean includeTerminator) throws UnicodeException; |
| |
| public static UnicodeUtils getInstance(int charEncodingCode) |
| throws UnicodeException |
| { |
| switch (charEncodingCode) |
| { |
| case CHAR_ENCODING_CODE_ISO_8859_1: |
| return new UnicodeMetricsASCII(); |
| case CHAR_ENCODING_CODE_UTF_8: |
| // Debug.debug("CHAR_ENCODING_CODE_UTF_8"); |
| return new UnicodeMetricsUTF8(); |
| case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM: |
| case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM: |
| // Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM"); |
| return new UnicodeMetricsUTF16WithBOM(); |
| case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM: |
| return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN); |
| case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM: |
| return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN); |
| default: |
| throw new UnicodeException("Unknown char encoding code: " |
| + charEncodingCode); |
| } |
| } |
| |
| private static class UnicodeMetricsASCII extends UnicodeUtils |
| { |
| @Override |
| public int findEnd(byte bytes[], int index, boolean includeTerminator) |
| throws UnicodeException |
| { |
| for (int i = index; i < bytes.length; i++) |
| { |
| if (bytes[i] == 0) |
| return includeTerminator ? i + 1 : i; |
| } |
| return bytes.length; |
| // throw new UnicodeException("Terminator not found."); |
| } |
| } |
| |
| // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils |
| // { |
| // public int findEnd(byte bytes[], int index, boolean includeTerminator) |
| // throws UnicodeException |
| // { |
| // for (int i = index; i < bytes.length; i++) |
| // { |
| // if (bytes[i] == 0) |
| // return includeTerminator ? i + 1 : i; |
| // } |
| // return bytes.length; |
| // // throw new UnicodeException("Terminator not found."); |
| // } |
| // } |
| |
| private static class UnicodeMetricsUTF8 extends UnicodeUtils |
| { |
| |
| @Override |
| public int findEnd(byte bytes[], int index, boolean includeTerminator) |
| throws UnicodeException |
| { |
| // http://en.wikipedia.org/wiki/UTF-8 |
| |
| while (true) |
| { |
| if (index == bytes.length) |
| return bytes.length; |
| if (index > bytes.length) |
| throw new UnicodeException("Terminator not found."); |
| |
| int c1 = 0xff & bytes[index++]; |
| if (c1 == 0) |
| return includeTerminator ? index : index - 1; |
| else if (c1 <= 0x7f) |
| continue; |
| else if (c1 <= 0xDF) |
| { |
| if (index >= bytes.length) |
| throw new UnicodeException("Invalid unicode."); |
| |
| int c2 = 0xff & bytes[index++]; |
| if (c2 < 0x80 || c2 > 0xBF) |
| throw new UnicodeException("Invalid code point."); |
| } else if (c1 <= 0xEF) |
| { |
| if (index >= bytes.length - 1) |
| throw new UnicodeException("Invalid unicode."); |
| |
| int c2 = 0xff & bytes[index++]; |
| if (c2 < 0x80 || c2 > 0xBF) |
| throw new UnicodeException("Invalid code point."); |
| int c3 = 0xff & bytes[index++]; |
| if (c3 < 0x80 || c3 > 0xBF) |
| throw new UnicodeException("Invalid code point."); |
| } else if (c1 <= 0xF4) |
| { |
| if (index >= bytes.length - 2) |
| throw new UnicodeException("Invalid unicode."); |
| |
| int c2 = 0xff & bytes[index++]; |
| if (c2 < 0x80 || c2 > 0xBF) |
| throw new UnicodeException("Invalid code point."); |
| int c3 = 0xff & bytes[index++]; |
| if (c3 < 0x80 || c3 > 0xBF) |
| throw new UnicodeException("Invalid code point."); |
| int c4 = 0xff & bytes[index++]; |
| if (c4 < 0x80 || c4 > 0xBF) |
| throw new UnicodeException("Invalid code point."); |
| } else |
| throw new UnicodeException("Invalid code point."); |
| } |
| } |
| } |
| |
| private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils |
| { |
| protected int byteOrder = BYTE_ORDER_BIG_ENDIAN; |
| |
| public UnicodeMetricsUTF16(int byteOrder) |
| { |
| this.byteOrder = byteOrder; |
| } |
| |
| public boolean isValid(byte bytes[], int index, |
| boolean mayHaveTerminator, boolean mustHaveTerminator) |
| { |
| // http://en.wikipedia.org/wiki/UTF-16/UCS-2 |
| |
| while (true) |
| { |
| if (index == bytes.length) |
| { |
| // end of buffer, no terminator found. |
| return !mustHaveTerminator; |
| } |
| |
| if (index >= bytes.length - 1) |
| { |
| // end of odd-length buffer, no terminator found. |
| return false; |
| } |
| |
| int c1 = 0xff & bytes[index++]; |
| int c2 = 0xff & bytes[index++]; |
| int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2; |
| |
| if (c1 == 0 && c2 == 0) |
| { |
| // terminator found. |
| return mayHaveTerminator; |
| } |
| |
| if (msb1 >= 0xD8) |
| { |
| // Surrogate pair found. |
| |
| if (msb1 >= 0xDC) |
| { |
| // invalid first surrogate. |
| return false; |
| } |
| |
| if (index >= bytes.length - 1) |
| { |
| // missing second surrogate. |
| return false; |
| } |
| |
| // second word. |
| int c3 = 0xff & bytes[index++]; |
| int c4 = 0xff & bytes[index++]; |
| int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4; |
| if (msb2 < 0xDC) |
| { |
| // invalid second surrogate. |
| return false; |
| } |
| } |
| } |
| } |
| |
| @Override |
| public int findEnd(byte bytes[], int index, boolean includeTerminator) |
| throws UnicodeException |
| { |
| // http://en.wikipedia.org/wiki/UTF-16/UCS-2 |
| |
| while (true) |
| { |
| if (index == bytes.length) |
| return bytes.length; |
| if (index > bytes.length - 1) |
| throw new UnicodeException("Terminator not found."); |
| |
| int c1 = 0xff & bytes[index++]; |
| int c2 = 0xff & bytes[index++]; |
| int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2; |
| |
| if (c1 == 0 && c2 == 0) |
| { |
| return includeTerminator ? index : index - 2; |
| } else if (msb1 >= 0xD8) |
| { |
| if (index > bytes.length - 1) |
| throw new UnicodeException("Terminator not found."); |
| |
| // second word. |
| int c3 = 0xff & bytes[index++]; |
| int c4 = 0xff & bytes[index++]; |
| int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4; |
| if (msb2 < 0xDC) |
| throw new UnicodeException("Invalid code point."); |
| } |
| } |
| } |
| } |
| |
| private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16 |
| { |
| |
| public UnicodeMetricsUTF16NoBOM(final int byteOrder) |
| { |
| super(byteOrder); |
| } |
| |
| } |
| |
| private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16 |
| { |
| |
| public UnicodeMetricsUTF16WithBOM() |
| { |
| super(BYTE_ORDER_BIG_ENDIAN); |
| } |
| |
| @Override |
| public int findEnd(byte bytes[], int index, boolean includeTerminator) |
| throws UnicodeException |
| { |
| // http://en.wikipedia.org/wiki/UTF-16/UCS-2 |
| |
| if (index >= bytes.length - 1) |
| throw new UnicodeException("Missing BOM."); |
| |
| int c1 = 0xff & bytes[index++]; |
| int c2 = 0xff & bytes[index++]; |
| if (c1 == 0xFF && c2 == 0xFE) |
| byteOrder = BYTE_ORDER_LITTLE_ENDIAN; |
| else if (c1 == 0xFE && c2 == 0xFF) |
| byteOrder = BYTE_ORDER_BIG_ENDIAN; |
| else |
| throw new UnicodeException("Invalid byte order mark."); |
| |
| return super.findEnd(bytes, index, includeTerminator); |
| } |
| } |
| |
| } |